diff options
Diffstat (limited to 'kernel')
67 files changed, 1679 insertions, 1163 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 76768ee812b2..08561f1acd13 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER def_bool y depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW +config LOCK_SPIN_ON_OWNER + def_bool y + depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER + config ARCH_USE_QUEUE_RWLOCK bool diff --git a/kernel/audit.c b/kernel/audit.c index f8f203e8018c..72ab759a0b43 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb) * This function doesn't consume an skb as might be expected since it has to * copy it anyways. */ -static void kauditd_send_multicast_skb(struct sk_buff *skb) +static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *copy; struct audit_net *aunet = net_generic(&init_net, audit_net_id); @@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb) * no reason for new multicast clients to continue with this * non-compliance. */ - copy = skb_copy(skb, GFP_KERNEL); + copy = skb_copy(skb, gfp_mask); if (!copy) return; - nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); + nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask); } /* @@ -1100,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb) } /* Run custom bind function on netlink socket group connect or bind requests. */ -static int audit_bind(int group) +static int audit_bind(struct net *net, int group) { if (!capable(CAP_AUDIT_READ)) return -EPERM; @@ -1940,7 +1940,7 @@ void audit_log_end(struct audit_buffer *ab) struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); nlh->nlmsg_len = ab->skb->len; - kauditd_send_multicast_skb(ab->skb); + kauditd_send_multicast_skb(ab->skb, ab->gfp_mask); /* * The original kaudit unicast socket sends up messages with diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 3598e13f2a65..4f68a326d92e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -442,19 +442,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; - } - - if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) { - struct pid *pid; - rcu_read_lock(); - pid = find_vpid(f->val); - if (!pid) { - rcu_read_unlock(); - err = -ESRCH; - goto exit_free; - } - f->val = pid_nr(pid); - rcu_read_unlock(); + entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } err = audit_field_valid(entry, f); @@ -630,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; + case AUDIT_LOGINUID_SET: + if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { + data->fields[i] = AUDIT_LOGINUID; + data->values[i] = AUDIT_UID_UNSET; + break; + } + /* fallthrough if set */ default: data->values[i] = f->val; } @@ -646,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) int i; if (a->flags != b->flags || + a->pflags != b->pflags || a->listnr != b->listnr || a->action != b->action || a->field_count != b->field_count) @@ -764,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) new = &entry->rule; new->vers_ops = old->vers_ops; new->flags = old->flags; + new->pflags = old->pflags; new->listnr = old->listnr; new->action = old->action; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c75522a83678..072566dd0caf 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -72,6 +72,8 @@ #include <linux/fs_struct.h> #include <linux/compat.h> #include <linux/ctype.h> +#include <linux/string.h> +#include <uapi/linux/limits.h> #include "audit.h" @@ -1861,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, } list_for_each_entry_reverse(n, &context->names_list, list) { - /* does the name pointer match? */ - if (!n->name || n->name->name != name->name) + if (!n->name || strcmp(n->name->name, name->name)) continue; /* match the correct record type */ @@ -1877,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, } out_alloc: - /* unable to find the name from a previous getname(). Allocate a new - * anonymous entry. - */ - n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); + /* unable to find an entry with both a matching name and type */ + n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; + /* unfortunately, while we may have a path name to record with the + * inode, we can't always rely on the string lasting until the end of + * the syscall so we need to create our own copy, it may fail due to + * memory allocation issues, but we do our best */ + if (name) { + /* we can't use getname_kernel() due to size limits */ + size_t len = strlen(name->name) + 1; + struct filename *new = __getname(); + + if (unlikely(!new)) + goto out; + + if (len <= (PATH_MAX - sizeof(*new))) { + new->name = (char *)(new) + sizeof(*new); + new->separate = false; + } else if (len <= PATH_MAX) { + /* this looks odd, but is due to final_putname() */ + struct filename *new2; + + new2 = kmalloc(sizeof(*new2), GFP_KERNEL); + if (unlikely(!new2)) { + __putname(new); + goto out; + } + new2->name = (char *)new; + new2->separate = true; + new = new2; + } else { + /* we should never get here, but let's be safe */ + __putname(new); + goto out; + } + strlcpy((char *)new->name, name->name, len); + new->uptr = NULL; + new->aname = n; + n->name = new; + n->name_put = true; + } out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d6594e457a25..a64e7a207d2b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - module_free(NULL, hdr); + module_memfree(hdr); } #endif /* CONFIG_BPF_JIT */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 088ac0b1b106..536edc2be307 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -150,7 +150,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct fd f = fdget(ufd); struct bpf_map *map; - void *key, *value; + void *key, *value, *ptr; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) @@ -169,20 +169,29 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; - err = -ENOENT; - rcu_read_lock(); - value = map->ops->map_lookup_elem(map, key); + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); if (!value) - goto err_unlock; + goto free_key; + + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, map->value_size); + rcu_read_unlock(); + + err = -ENOENT; + if (!ptr) + goto free_value; err = -EFAULT; if (copy_to_user(uvalue, value, map->value_size) != 0) - goto err_unlock; + goto free_value; err = 0; -err_unlock: - rcu_read_unlock(); +free_value: + kfree(value); free_key: kfree(key); err_put: diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bb263d0caab3..04cfe8ace520 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1909,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb) * * And don't kill the default root. */ - if (css_has_online_children(&root->cgrp.self) || + if (!list_empty(&root->cgrp.self.children) || root == &cgrp_dfl_root) cgroup_put(&root->cgrp); else diff --git a/kernel/cpu.c b/kernel/cpu.c index 5d220234b3ca..1972b161c61e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,22 +58,23 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; - struct mutex lock; /* Synchronizes accesses to refcount, */ + /* wait queue to wake up the active_writer */ + wait_queue_head_t wq; + /* verifies that no writer will get active while readers are active */ + struct mutex lock; /* * Also blocks the new readers during * an ongoing cpu hotplug operation. */ - int refcount; - /* And allows lockless put_online_cpus(). */ - atomic_t puts_pending; + atomic_t refcount; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } cpu_hotplug = { .active_writer = NULL, + .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), - .refcount = 0, #ifdef CONFIG_DEBUG_LOCK_ALLOC .dep_map = {.name = "cpu_hotplug.lock" }, #endif @@ -86,15 +87,6 @@ static struct { #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) -static void apply_puts_pending(int max) -{ - int delta; - - if (atomic_read(&cpu_hotplug.puts_pending) >= max) { - delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); - cpu_hotplug.refcount -= delta; - } -} void get_online_cpus(void) { @@ -103,8 +95,7 @@ void get_online_cpus(void) return; cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); - apply_puts_pending(65536); - cpu_hotplug.refcount++; + atomic_inc(&cpu_hotplug.refcount); mutex_unlock(&cpu_hotplug.lock); } EXPORT_SYMBOL_GPL(get_online_cpus); @@ -116,8 +107,7 @@ bool try_get_online_cpus(void) if (!mutex_trylock(&cpu_hotplug.lock)) return false; cpuhp_lock_acquire_tryread(); - apply_puts_pending(65536); - cpu_hotplug.refcount++; + atomic_inc(&cpu_hotplug.refcount); mutex_unlock(&cpu_hotplug.lock); return true; } @@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus); void put_online_cpus(void) { + int refcount; + if (cpu_hotplug.active_writer == current) return; - if (!mutex_trylock(&cpu_hotplug.lock)) { - atomic_inc(&cpu_hotplug.puts_pending); - cpuhp_lock_release(); - return; - } - if (WARN_ON(!cpu_hotplug.refcount)) - cpu_hotplug.refcount++; /* try to fix things up */ + refcount = atomic_dec_return(&cpu_hotplug.refcount); + if (WARN_ON(refcount < 0)) /* try to fix things up */ + atomic_inc(&cpu_hotplug.refcount); + + if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) + wake_up(&cpu_hotplug.wq); - if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) - wake_up_process(cpu_hotplug.active_writer); - mutex_unlock(&cpu_hotplug.lock); cpuhp_lock_release(); } @@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus); */ void cpu_hotplug_begin(void) { - cpu_hotplug.active_writer = current; + DEFINE_WAIT(wait); + cpu_hotplug.active_writer = current; cpuhp_lock_acquire(); + for (;;) { mutex_lock(&cpu_hotplug.lock); - apply_puts_pending(1); - if (likely(!cpu_hotplug.refcount)) - break; - __set_current_state(TASK_UNINTERRUPTIBLE); + prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); + if (likely(!atomic_read(&cpu_hotplug.refcount))) + break; mutex_unlock(&cpu_hotplug.lock); schedule(); } + finish_wait(&cpu_hotplug.wq, &wait); } void cpu_hotplug_done(void) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1adf62b39b96..07ce18ca71e0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -27,6 +27,9 @@ * version 2. This program is licensed "as is" without any warranty of any * kind, whether express or implied. */ + +#define pr_fmt(fmt) "KGDB: " fmt + #include <linux/pid_namespace.h> #include <linux/clocksource.h> #include <linux/serial_core.h> @@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr) return err; err = kgdb_arch_remove_breakpoint(&tmp); if (err) - printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " - "memory destroyed at: %lx", addr); + pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n", + addr); return err; } @@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void) error = kgdb_arch_set_breakpoint(&kgdb_break[i]); if (error) { ret = error; - printk(KERN_INFO "KGDB: BP install failed: %lx", - kgdb_break[i].bpt_addr); + pr_info("BP install failed: %lx\n", + kgdb_break[i].bpt_addr); continue; } @@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void) continue; error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) { - printk(KERN_INFO "KGDB: BP remove failed: %lx\n", - kgdb_break[i].bpt_addr); + pr_info("BP remove failed: %lx\n", + kgdb_break[i].bpt_addr); ret = error; } @@ -367,7 +370,7 @@ int dbg_remove_all_break(void) goto setundefined; error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) - printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + pr_err("breakpoint remove failed: %lx\n", kgdb_break[i].bpt_addr); setundefined: kgdb_break[i].state = BP_UNDEFINED; @@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait) if (print_wait) { #ifdef CONFIG_KGDB_KDB if (!dbg_kdb_mode) - printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); + pr_crit("waiting... or $3#33 for KDB\n"); #else - printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); + pr_crit("Waiting for remote debugger\n"); #endif } return 1; @@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) exception_level = 0; kgdb_skipexception(ks->ex_vector, ks->linux_regs); dbg_activate_sw_breakpoints(); - printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", - addr); + pr_crit("re-enter error: breakpoint removed %lx\n", addr); WARN_ON_ONCE(1); return 1; @@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) panic("Recursive entry to debugger"); } - printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); + pr_crit("re-enter exception: ALL breakpoints killed\n"); #ifdef CONFIG_KGDB_KDB /* Allow kdb to debug itself one level */ return 0; @@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, int cpu; int trace_on = 0; int online_cpus = num_online_cpus(); + u64 time_left; kgdb_info[ks->cpu].enter_kgdb++; kgdb_info[ks->cpu].exception_state |= exception_state; @@ -595,9 +598,13 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + - atomic_read(&slaves_in_kgdb)) != online_cpus) + time_left = loops_per_jiffy * HZ; + while (kgdb_do_roundup && --time_left && + (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != + online_cpus) cpu_relax(); + if (!time_left) + pr_crit("KGDB: Timed out waiting for secondary CPUs.\n"); /* * At this point the primary processor is completely @@ -795,15 +802,15 @@ static struct console kgdbcons = { static void sysrq_handle_dbg(int key) { if (!dbg_io_ops) { - printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + pr_crit("ERROR: No KGDB I/O module available\n"); return; } if (!kgdb_connected) { #ifdef CONFIG_KGDB_KDB if (!dbg_kdb_mode) - printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); + pr_crit("KGDB or $3#33 for KDB\n"); #else - printk(KERN_CRIT "Entering KGDB\n"); + pr_crit("Entering KGDB\n"); #endif } @@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void) { kgdb_break_asap = 0; - printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + pr_crit("Waiting for connection from remote gdb...\n"); kgdb_breakpoint(); } @@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) if (dbg_io_ops) { spin_unlock(&kgdb_registration_lock); - printk(KERN_ERR "kgdb: Another I/O driver is already " - "registered with KGDB.\n"); + pr_err("Another I/O driver is already registered with KGDB\n"); return -EBUSY; } @@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); - printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", - new_dbg_io_ops->name); + pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); /* Arm KGDB now. */ kgdb_register_callbacks(); @@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) spin_unlock(&kgdb_registration_lock); - printk(KERN_INFO - "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + pr_info("Unregistered I/O driver %s, debugger disabled\n", old_dbg_io_ops->name); } EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index b20d544f20c2..e1dbf4a2c69e 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -531,22 +531,29 @@ void __init kdb_initbptab(void) for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) bp->bp_free = 1; - kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", - "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", - "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + kdb_register_flags("bp", kdb_bp, "[<vaddr>]", + "Set/Display breakpoints", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); + kdb_register_flags("bl", kdb_bp, "[<vaddr>]", + "Display breakpoints", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) - kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", - "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("bc", kdb_bc, "<bpnum>", - "Clear Breakpoint", 0, KDB_REPEAT_NONE); - kdb_register_repeat("be", kdb_bc, "<bpnum>", - "Enable Breakpoint", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bd", kdb_bc, "<bpnum>", - "Disable Breakpoint", 0, KDB_REPEAT_NONE); - - kdb_register_repeat("ss", kdb_ss, "", - "Single Step", 1, KDB_REPEAT_NO_ARGS); + kdb_register_flags("bph", kdb_bp, "[<vaddr>]", + "[datar [length]|dataw [length]] Set hw brk", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); + kdb_register_flags("bc", kdb_bc, "<bpnum>", + "Clear Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + kdb_register_flags("be", kdb_bc, "<bpnum>", + "Enable Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + kdb_register_flags("bd", kdb_bc, "<bpnum>", + "Disable Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + + kdb_register_flags("ss", kdb_ss, "", + "Single Step", 1, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); /* * Architecture dependent initialization. */ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8859ca34dcfe..15e1a7af5dd0 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks) ks->pass_exception = 1; KDB_FLAG_SET(CATASTROPHIC); } + /* set CATASTROPHIC if the system contains unresponsive processors */ + for_each_online_cpu(i) + if (!kgdb_info[i].enter_kgdb) + KDB_FLAG_SET(CATASTROPHIC); if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { KDB_STATE_CLEAR(SSBPT); KDB_STATE_CLEAR(DOING_SS); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 379650b984f8..7b40c5f07dce 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -12,6 +12,7 @@ */ #include <linux/ctype.h> +#include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/kmsg_dump.h> @@ -23,6 +24,7 @@ #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/kallsyms.h> @@ -42,6 +44,12 @@ #include <linux/slab.h> #include "kdb_private.h" +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "kdb." + +static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; +module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); + #define GREP_LEN 256 char kdb_grep_string[GREP_LEN]; int kdb_grepping_flag; @@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = { KDBMSG(BADLENGTH, "Invalid length field"), KDBMSG(NOBP, "No Breakpoint exists"), KDBMSG(BADADDR, "Invalid address"), + KDBMSG(NOPERM, "Permission denied"), }; #undef KDBMSG @@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu) } /* + * Check whether the flags of the current command and the permissions + * of the kdb console has allow a command to be run. + */ +static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, + bool no_args) +{ + /* permissions comes from userspace so needs massaging slightly */ + permissions &= KDB_ENABLE_MASK; + permissions |= KDB_ENABLE_ALWAYS_SAFE; + + /* some commands change group when launched with no arguments */ + if (no_args) + permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT; + + flags |= KDB_ENABLE_ALL; + + return permissions & flags; +} + +/* * kdbgetenv - This function will return the character string value of * an environment variable. * Parameters: @@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg, kdb_symtab_t symtab; /* + * If the enable flags prohibit both arbitrary memory access + * and flow control then there are no reasonable grounds to + * provide symbol lookup. + */ + if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL, + kdb_cmd_enabled, false)) + return KDB_NOPERM; + + /* * Process arguments which follow the following syntax: * * symbol | numeric-address [+/- numeric-offset] @@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) if (!s->count) s->usable = 0; if (s->usable) - kdb_register(s->name, kdb_exec_defcmd, - s->usage, s->help, 0); + /* macros are always safe because when executed each + * internal command re-enters kdb_parse() and is + * safety checked individually. + */ + kdb_register_flags(s->name, kdb_exec_defcmd, s->usage, + s->help, 0, + KDB_ENABLE_ALWAYS_SAFE); return 0; } if (!s->usable) @@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr) if (i < kdb_max_commands) { int result; + + if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1)) + return KDB_NOPERM; + KDB_STATE_SET(CMD); result = (*tp->cmd_func)(argc-1, (const char **)argv); if (result && ignore_errors && result > KDB_CMD_GO) result = 0; KDB_STATE_CLEAR(CMD); - switch (tp->cmd_repeat) { - case KDB_REPEAT_NONE: - argc = 0; - if (argv[0]) - *(argv[0]) = '\0'; - break; - case KDB_REPEAT_NO_ARGS: - argc = 1; - if (argv[1]) - *(argv[1]) = '\0'; - break; - case KDB_REPEAT_WITH_ARGS: - break; - } + + if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS) + return result; + + argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0; + if (argv[argc]) + *(argv[argc]) = '\0'; return result; } @@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv) */ static int kdb_sr(int argc, const char **argv) { + bool check_mask = + !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false); + if (argc != 1) return KDB_ARGCOUNT; + kdb_trap_printk++; - __handle_sysrq(*argv[1], false); + __handle_sysrq(*argv[1], check_mask); kdb_trap_printk--; return 0; @@ -1979,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4ld ", module_refcount(mod)); + kdb_printf("%4d ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); @@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void) for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) { state = 'F'; /* cpu is offline */ + } else if (!kgdb_info[i].enter_kgdb) { + state = 'D'; /* cpu is online but unresponsive */ } else { state = ' '; /* cpu is responding to kdb */ if (kdb_task_state_char(KDB_TSK(i)) == 'I') @@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv) /* * Validate cpunum */ - if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) + if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) return KDB_BADCPUNUM; dbg_switch_cpu = cpunum; @@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv) return 0; if (!kt->cmd_name) continue; + if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true)) + continue; if (strlen(kt->cmd_usage) > 20) space = "\n "; kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, @@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv) } /* - * kdb_register_repeat - This function is used to register a kernel + * kdb_register_flags - This function is used to register a kernel * debugger command. * Inputs: * cmd Command name @@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv) * zero for success, one if a duplicate command. */ #define kdb_command_extend 50 /* arbitrary */ -int kdb_register_repeat(char *cmd, - kdb_func_t func, - char *usage, - char *help, - short minlen, - kdb_repeat_t repeat) +int kdb_register_flags(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen, + kdb_cmdflags_t flags) { int i; kdbtab_t *kp; @@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd, kp->cmd_func = func; kp->cmd_usage = usage; kp->cmd_help = help; - kp->cmd_flags = 0; kp->cmd_minlen = minlen; - kp->cmd_repeat = repeat; + kp->cmd_flags = flags; return 0; } -EXPORT_SYMBOL_GPL(kdb_register_repeat); +EXPORT_SYMBOL_GPL(kdb_register_flags); /* * kdb_register - Compatibility register function for commands that do * not need to specify a repeat state. Equivalent to - * kdb_register_repeat with KDB_REPEAT_NONE. + * kdb_register_flags with flags set to 0. * Inputs: * cmd Command name * func Function to execute the command @@ -2721,8 +2768,7 @@ int kdb_register(char *cmd, char *help, short minlen) { - return kdb_register_repeat(cmd, func, usage, help, minlen, - KDB_REPEAT_NONE); + return kdb_register_flags(cmd, func, usage, help, minlen, 0); } EXPORT_SYMBOL_GPL(kdb_register); @@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void) for_each_kdbcmd(kp, i) kp->cmd_name = NULL; - kdb_register_repeat("md", kdb_md, "<vaddr>", + kdb_register_flags("md", kdb_md, "<vaddr>", "Display Memory Contents, also mdWcN, e.g. md8c1", 1, - KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", - "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", - "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mds", kdb_md, "<vaddr>", - "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", - "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("go", kdb_go, "[<vaddr>]", - "Continue Execution", 1, KDB_REPEAT_NONE); - kdb_register_repeat("rd", kdb_rd, "", - "Display Registers", 0, KDB_REPEAT_NONE); - kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", - "Modify Registers", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ef", kdb_ef, "<vaddr>", - "Display exception frame", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", - "Stack traceback", 1, KDB_REPEAT_NONE); - kdb_register_repeat("btp", kdb_bt, "<pid>", - "Display stack for process <pid>", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", - "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); - kdb_register_repeat("btc", kdb_bt, "", - "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); - kdb_register_repeat("btt", kdb_bt, "<vaddr>", + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>", + "Display Raw Memory", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>", + "Display Physical Memory", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mds", kdb_md, "<vaddr>", + "Display Memory Symbolically", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>", + "Modify Memory Contents", 0, + KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS); + kdb_register_flags("go", kdb_go, "[<vaddr>]", + "Continue Execution", 1, + KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); + kdb_register_flags("rd", kdb_rd, "", + "Display Registers", 0, + KDB_ENABLE_REG_READ); + kdb_register_flags("rm", kdb_rm, "<reg> <contents>", + "Modify Registers", 0, + KDB_ENABLE_REG_WRITE); + kdb_register_flags("ef", kdb_ef, "<vaddr>", + "Display exception frame", 0, + KDB_ENABLE_MEM_READ); + kdb_register_flags("bt", kdb_bt, "[<vaddr>]", + "Stack traceback", 1, + KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); + kdb_register_flags("btp", kdb_bt, "<pid>", + "Display stack for process <pid>", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", + "Backtrace all processes matching state flag", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("btc", kdb_bt, "", + "Backtrace current process on each cpu", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("btt", kdb_bt, "<vaddr>", "Backtrace process given its struct task address", 0, - KDB_REPEAT_NONE); - kdb_register_repeat("env", kdb_env, "", - "Show environment variables", 0, KDB_REPEAT_NONE); - kdb_register_repeat("set", kdb_set, "", - "Set environment variables", 0, KDB_REPEAT_NONE); - kdb_register_repeat("help", kdb_help, "", - "Display Help Message", 1, KDB_REPEAT_NONE); - kdb_register_repeat("?", kdb_help, "", - "Display Help Message", 0, KDB_REPEAT_NONE); - kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", - "Switch to new cpu", 0, KDB_REPEAT_NONE); - kdb_register_repeat("kgdb", kdb_kgdb, "", - "Enter kgdb mode", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", - "Display active task list", 0, KDB_REPEAT_NONE); - kdb_register_repeat("pid", kdb_pid, "<pidnum>", - "Switch to another task", 0, KDB_REPEAT_NONE); - kdb_register_repeat("reboot", kdb_reboot, "", - "Reboot the machine immediately", 0, KDB_REPEAT_NONE); + KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); + kdb_register_flags("env", kdb_env, "", + "Show environment variables", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("set", kdb_set, "", + "Set environment variables", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("help", kdb_help, "", + "Display Help Message", 1, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("?", kdb_help, "", + "Display Help Message", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("cpu", kdb_cpu, "<cpunum>", + "Switch to new cpu", 0, + KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); + kdb_register_flags("kgdb", kdb_kgdb, "", + "Enter kgdb mode", 0, 0); + kdb_register_flags("ps", kdb_ps, "[<flags>|A]", + "Display active task list", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("pid", kdb_pid, "<pidnum>", + "Switch to another task", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("reboot", kdb_reboot, "", + "Reboot the machine immediately", 0, + KDB_ENABLE_REBOOT); #if defined(CONFIG_MODULES) - kdb_register_repeat("lsmod", kdb_lsmod, "", - "List loaded kernel modules", 0, KDB_REPEAT_NONE); + kdb_register_flags("lsmod", kdb_lsmod, "", + "List loaded kernel modules", 0, + KDB_ENABLE_INSPECT); #endif #if defined(CONFIG_MAGIC_SYSRQ) - kdb_register_repeat("sr", kdb_sr, "<key>", - "Magic SysRq key", 0, KDB_REPEAT_NONE); + kdb_register_flags("sr", kdb_sr, "<key>", + "Magic SysRq key", 0, + KDB_ENABLE_ALWAYS_SAFE); #endif #if defined(CONFIG_PRINTK) - kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", - "Display syslog buffer", 0, KDB_REPEAT_NONE); + kdb_register_flags("dmesg", kdb_dmesg, "[lines]", + "Display syslog buffer", 0, + KDB_ENABLE_ALWAYS_SAFE); #endif if (arch_kgdb_ops.enable_nmi) { - kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", - "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); - } - kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", - "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); - kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", - "Send a signal to a process", 0, KDB_REPEAT_NONE); - kdb_register_repeat("summary", kdb_summary, "", - "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", - "Display per_cpu variables", 3, KDB_REPEAT_NONE); - kdb_register_repeat("grephelp", kdb_grep_help, "", - "Display help on | grep", 0, KDB_REPEAT_NONE); + kdb_register_flags("disable_nmi", kdb_disable_nmi, "", + "Disable NMI entry to KDB", 0, + KDB_ENABLE_ALWAYS_SAFE); + } + kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"", + "Define a set of commands, down to endefcmd", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("kill", kdb_kill, "<-signal> <pid>", + "Send a signal to a process", 0, + KDB_ENABLE_SIGNAL); + kdb_register_flags("summary", kdb_summary, "", + "Summarize the system", 4, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", + "Display per_cpu variables", 3, + KDB_ENABLE_MEM_READ); + kdb_register_flags("grephelp", kdb_grep_help, "", + "Display help on | grep", 0, + KDB_ENABLE_ALWAYS_SAFE); } /* Execute any commands defined in kdb_cmds. */ diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 7afd3c8c41d5..eaacd1693954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -172,10 +172,9 @@ typedef struct _kdbtab { kdb_func_t cmd_func; /* Function to execute command */ char *cmd_usage; /* Usage String for this command */ char *cmd_help; /* Help message for this command */ - short cmd_flags; /* Parsing flags */ short cmd_minlen; /* Minimum legal # command * chars required */ - kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ + kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ } kdbtab_t; extern int kdb_bt(int, const char **); /* KDB display back trace */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c1ee7f2bebc..7f2fbb8b5069 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, rotation_list); +static DEFINE_PER_CPU(struct list_head, active_ctx_list); /* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. + * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and + * perf_event_task_tick() are fully serialized because they're strictly cpu + * affine and perf_event_ctx{activate,deactivate} are called with IRQs + * disabled, while perf_event_task_tick is called from IRQ context. */ -static void perf_pmu_rotate_start(struct pmu *pmu) +static void perf_event_ctx_activate(struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = this_cpu_ptr(&rotation_list); + struct list_head *head = this_cpu_ptr(&active_ctx_list); WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) - list_add(&cpuctx->rotation_list, head); + WARN_ON(!list_empty(&ctx->active_ctx_list)); + + list_add(&ctx->active_ctx_list, head); +} + +static void perf_event_ctx_deactivate(struct perf_event_context *ctx) +{ + WARN_ON(!irqs_disabled()); + + WARN_ON(list_empty(&ctx->active_ctx_list)); + + list_del_init(&ctx->active_ctx_list); } static void get_ctx(struct perf_event_context *ctx) @@ -907,6 +917,84 @@ static void put_ctx(struct perf_event_context *ctx) } /* + * Because of perf_event::ctx migration in sys_perf_event_open::move_group and + * perf_pmu_migrate_context() we need some magic. + * + * Those places that change perf_event::ctx will hold both + * perf_event_ctx::mutex of the 'old' and 'new' ctx value. + * + * Lock ordering is by mutex address. There is one other site where + * perf_event_context::mutex nests and that is put_event(). But remember that + * that is a parent<->child context relation, and migration does not affect + * children, therefore these two orderings should not interact. + * + * The change in perf_event::ctx does not affect children (as claimed above) + * because the sys_perf_event_open() case will install a new event and break + * the ctx parent<->child relation, and perf_pmu_migrate_context() is only + * concerned with cpuctx and that doesn't have children. + * + * The places that change perf_event::ctx will issue: + * + * perf_remove_from_context(); + * synchronize_rcu(); + * perf_install_in_context(); + * + * to affect the change. The remove_from_context() + synchronize_rcu() should + * quiesce the event, after which we can install it in the new location. This + * means that only external vectors (perf_fops, prctl) can perturb the event + * while in transit. Therefore all such accessors should also acquire + * perf_event_context::mutex to serialize against this. + * + * However; because event->ctx can change while we're waiting to acquire + * ctx->mutex we must be careful and use the below perf_event_ctx_lock() + * function. + * + * Lock order: + * task_struct::perf_event_mutex + * perf_event_context::mutex + * perf_event_context::lock + * perf_event::child_mutex; + * perf_event::mmap_mutex + * mmap_sem + */ +static struct perf_event_context * +perf_event_ctx_lock_nested(struct perf_event *event, int nesting) +{ + struct perf_event_context *ctx; + +again: + rcu_read_lock(); + ctx = ACCESS_ONCE(event->ctx); + if (!atomic_inc_not_zero(&ctx->refcount)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + mutex_lock_nested(&ctx->mutex, nesting); + if (event->ctx != ctx) { + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + + return ctx; +} + +static inline struct perf_event_context * +perf_event_ctx_lock(struct perf_event *event) +{ + return perf_event_ctx_lock_nested(event, 0); +} + +static void perf_event_ctx_unlock(struct perf_event *event, + struct perf_event_context *ctx) +{ + mutex_unlock(&ctx->mutex); + put_ctx(ctx); +} + +/* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up * calling scheduler related locks and ctx->lock nests inside those. @@ -1155,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_branch_stack++; list_add_rcu(&event->event_entry, &ctx->event_list); - if (!ctx->nr_events) - perf_pmu_rotate_start(ctx->pmu); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -1275,6 +1361,8 @@ static void perf_group_attach(struct perf_event *event) if (group_leader == event) return; + WARN_ON_ONCE(group_leader->ctx != event->ctx); + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && !is_software_event(event)) group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; @@ -1296,6 +1384,10 @@ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; + + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); + /* * We can have double detach due to exit/hot-unplug + close. */ @@ -1380,6 +1472,8 @@ static void perf_group_detach(struct perf_event *event) /* Inherit group flags from the previous leader */ sibling->group_flags = event->group_flags; + + WARN_ON_ONCE(sibling->ctx != event->ctx); } out: @@ -1442,6 +1536,10 @@ event_sched_out(struct perf_event *event, { u64 tstamp = perf_event_time(event); u64 delta; + + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); + /* * An event which could not be activated because of * filter mismatch still needs to have its timings @@ -1471,7 +1569,8 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; - ctx->nr_active--; + if (!--ctx->nr_active) + perf_event_ctx_deactivate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) @@ -1654,7 +1753,7 @@ int __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ -void perf_event_disable(struct perf_event *event) +static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -1695,6 +1794,19 @@ retry: } raw_spin_unlock_irq(&ctx->lock); } + +/* + * Strictly speaking kernel users cannot create groups and therefore this + * interface does not need the perf_event_ctx_lock() magic. + */ +void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + _perf_event_disable(event); + perf_event_ctx_unlock(event, ctx); +} EXPORT_SYMBOL_GPL(perf_event_disable); static void perf_set_shadow_time(struct perf_event *event, @@ -1782,7 +1894,8 @@ event_sched_in(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu++; - ctx->nr_active++; + if (!ctx->nr_active++) + perf_event_ctx_activate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; @@ -2158,7 +2271,7 @@ unlock: * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ -void perf_event_enable(struct perf_event *event) +static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -2214,9 +2327,21 @@ retry: out: raw_spin_unlock_irq(&ctx->lock); } + +/* + * See perf_event_disable(); + */ +void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + _perf_event_enable(event); + perf_event_ctx_unlock(event, ctx); +} EXPORT_SYMBOL_GPL(perf_event_enable); -int perf_event_refresh(struct perf_event *event, int refresh) +static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events @@ -2225,10 +2350,25 @@ int perf_event_refresh(struct perf_event *event, int refresh) return -EINVAL; atomic_add(refresh, &event->event_limit); - perf_event_enable(event); + _perf_event_enable(event); return 0; } + +/* + * See perf_event_disable() + */ +int perf_event_refresh(struct perf_event *event, int refresh) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_refresh(event, refresh); + perf_event_ctx_unlock(event, ctx); + + return ret; +} EXPORT_SYMBOL_GPL(perf_event_refresh); static void ctx_sched_out(struct perf_event_context *ctx, @@ -2612,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); - - /* - * Since these rotations are per-cpu, we need to ensure the - * cpu-context we got scheduled on is actually rotating. - */ - perf_pmu_rotate_start(ctx->pmu); } /* @@ -2905,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx) list_rotate_left(&ctx->flexible_groups); } -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. - */ static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; + int rotate = 0; if (cpuctx->ctx.nr_events) { - remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { - remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; } @@ -2947,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); done: - if (remove) - list_del_init(&cpuctx->rotation_list); return rotate; } @@ -2966,9 +3091,8 @@ bool perf_event_can_stop_tick(void) void perf_event_task_tick(void) { - struct list_head *head = this_cpu_ptr(&rotation_list); - struct perf_cpu_context *cpuctx, *tmp; - struct perf_event_context *ctx; + struct list_head *head = this_cpu_ptr(&active_ctx_list); + struct perf_event_context *ctx, *tmp; int throttled; WARN_ON(!irqs_disabled()); @@ -2976,14 +3100,8 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); - list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { - ctx = &cpuctx->ctx; + list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); - - ctx = cpuctx->task_ctx; - if (ctx) - perf_adjust_freq_unthr_context(ctx, throttled); - } } static int event_enable_on_exec(struct perf_event *event, @@ -3142,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->active_ctx_list); INIT_LIST_HEAD(&ctx->pinned_groups); INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); @@ -3421,7 +3540,16 @@ static void perf_remove_from_owner(struct perf_event *event) rcu_read_unlock(); if (owner) { - mutex_lock(&owner->perf_event_mutex); + /* + * If we're here through perf_event_exit_task() we're already + * holding ctx->mutex which would be an inversion wrt. the + * normal lock order. + * + * However we can safely take this lock because its the child + * ctx->mutex. + */ + mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); + /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex @@ -3440,7 +3568,7 @@ static void perf_remove_from_owner(struct perf_event *event) */ static void put_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; + struct perf_event_context *ctx; if (!atomic_long_dec_and_test(&event->refcount)) return; @@ -3448,7 +3576,6 @@ static void put_event(struct perf_event *event) if (!is_kernel_event(event)) perf_remove_from_owner(event); - WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: * @@ -3461,7 +3588,8 @@ static void put_event(struct perf_event *event) * the last filedesc died, so there is no possibility * to trigger the AB-BA case. */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); + WARN_ON_ONCE(ctx->parent_ctx); perf_remove_from_context(event, true); mutex_unlock(&ctx->mutex); @@ -3547,12 +3675,13 @@ static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, ret = -EFAULT; struct perf_event_context *ctx = leader->ctx; - u64 values[5]; + int n = 0, size = 0, ret; u64 count, enabled, running; + u64 values[5]; + + lockdep_assert_held(&ctx->mutex); - mutex_lock(&ctx->mutex); count = perf_event_read_value(leader, &enabled, &running); values[n++] = 1 + leader->nr_siblings; @@ -3567,7 +3696,7 @@ static int perf_event_read_group(struct perf_event *event, size = n * sizeof(u64); if (copy_to_user(buf, values, size)) - goto unlock; + return -EFAULT; ret = size; @@ -3581,14 +3710,11 @@ static int perf_event_read_group(struct perf_event *event, size = n * sizeof(u64); if (copy_to_user(buf + ret, values, size)) { - ret = -EFAULT; - goto unlock; + return -EFAULT; } ret += size; } -unlock: - mutex_unlock(&ctx->mutex); return ret; } @@ -3660,8 +3786,14 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; + struct perf_event_context *ctx; + int ret; - return perf_read_hw(event, buf, count); + ctx = perf_event_ctx_lock(event); + ret = perf_read_hw(event, buf, count); + perf_event_ctx_unlock(event, ctx); + + return ret; } static unsigned int perf_poll(struct file *file, poll_table *wait) @@ -3687,7 +3819,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } -static void perf_event_reset(struct perf_event *event) +static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); local64_set(&event->count, 0); @@ -3706,6 +3838,7 @@ static void perf_event_for_each_child(struct perf_event *event, struct perf_event *child; WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); func(event); list_for_each_entry(child, &event->child_list, child_list) @@ -3719,14 +3852,13 @@ static void perf_event_for_each(struct perf_event *event, struct perf_event_context *ctx = event->ctx; struct perf_event *sibling; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); + lockdep_assert_held(&ctx->mutex); + event = event->group_leader; perf_event_for_each_child(event, func); list_for_each_entry(sibling, &event->sibling_list, group_entry) perf_event_for_each_child(sibling, func); - mutex_unlock(&ctx->mutex); } static int perf_event_period(struct perf_event *event, u64 __user *arg) @@ -3796,25 +3928,24 @@ static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { - struct perf_event *event = file->private_data; void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { case PERF_EVENT_IOC_ENABLE: - func = perf_event_enable; + func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: - func = perf_event_disable; + func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: - func = perf_event_reset; + func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: - return perf_event_refresh(event, arg); + return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: return perf_event_period(event, (u64 __user *)arg); @@ -3861,6 +3992,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + struct perf_event_context *ctx; + long ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_ioctl(event, cmd, arg); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + #ifdef CONFIG_COMPAT static long perf_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -3883,11 +4027,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, int perf_event_task_enable(void) { + struct perf_event_context *ctx; struct perf_event *event; mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_enable); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { + ctx = perf_event_ctx_lock(event); + perf_event_for_each_child(event, _perf_event_enable); + perf_event_ctx_unlock(event, ctx); + } mutex_unlock(¤t->perf_event_mutex); return 0; @@ -3895,11 +4043,15 @@ int perf_event_task_enable(void) int perf_event_task_disable(void) { + struct perf_event_context *ctx; struct perf_event *event; mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_disable); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { + ctx = perf_event_ctx_lock(event); + perf_event_for_each_child(event, _perf_event_disable); + perf_event_ctx_unlock(event, ctx); + } mutex_unlock(¤t->perf_event_mutex); return 0; @@ -4461,18 +4613,14 @@ perf_output_sample_regs(struct perf_output_handle *handle, } static void perf_sample_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs) + struct pt_regs *regs, + struct pt_regs *regs_user_copy) { - if (!user_mode(regs)) { - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - regs_user->abi = perf_reg_abi(current); + if (user_mode(regs)) { + regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; + } else if (current->mm) { + perf_get_regs_user(regs_user, regs, regs_user_copy); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; @@ -4951,7 +5099,8 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) - perf_sample_regs_user(&data->regs_user, regs); + perf_sample_regs_user(&data->regs_user, regs, + &data->regs_user_copy); if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ @@ -5892,6 +6041,8 @@ end: rcu_read_unlock(); } +DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); + int perf_swevent_get_recursion_context(void) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); @@ -5907,21 +6058,30 @@ inline void perf_swevent_put_recursion_context(int rctx) put_recursion_context(swhash->recursion, rctx); } -void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; - int rctx; - preempt_disable_notrace(); - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) + if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); +} + +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +{ + int rctx; + + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + if (unlikely(rctx < 0)) + goto fail; + + ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); +fail: preempt_enable_notrace(); } @@ -6779,12 +6939,10 @@ skip_type: __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; __perf_cpu_hrtimer_init(cpuctx, cpu); - INIT_LIST_HEAD(&cpuctx->rotation_list); cpuctx->unique_pmu = pmu; } @@ -6857,6 +7015,20 @@ void perf_pmu_unregister(struct pmu *pmu) } EXPORT_SYMBOL_GPL(perf_pmu_unregister); +static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) +{ + int ret; + + if (!try_module_get(pmu->module)) + return -ENODEV; + event->pmu = pmu; + ret = pmu->event_init(event); + if (ret) + module_put(pmu->module); + + return ret; +} + struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; @@ -6869,24 +7041,14 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { - if (!try_module_get(pmu->module)) { - pmu = ERR_PTR(-ENODEV); - goto unlock; - } - event->pmu = pmu; - ret = pmu->event_init(event); + ret = perf_try_init_event(pmu, event); if (ret) pmu = ERR_PTR(ret); goto unlock; } list_for_each_entry_rcu(pmu, &pmus, entry) { - if (!try_module_get(pmu->module)) { - pmu = ERR_PTR(-ENODEV); - goto unlock; - } - event->pmu = pmu; - ret = pmu->event_init(event); + ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; @@ -7250,6 +7412,15 @@ out: return ret; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -7265,7 +7436,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx; + struct perf_event_context *ctx, *uninitialized_var(gctx); struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; @@ -7423,7 +7594,19 @@ SYSCALL_DEFINE5(perf_event_open, * task or CPU context: */ if (move_group) { - if (group_leader->ctx->type != ctx->type) + /* + * Make sure we're both on the same task, or both + * per-cpu events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. + */ + if (group_leader->cpu != event->cpu) goto err_context; } else { if (group_leader->ctx != ctx) @@ -7451,43 +7634,68 @@ SYSCALL_DEFINE5(perf_event_open, } if (move_group) { - struct perf_event_context *gctx = group_leader->ctx; - - mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader, false); + gctx = group_leader->ctx; /* - * Removing from the context ends up with disabled - * event. What we want here is event in the initial - * startup state, ready to be add into new context. + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. */ - perf_event__state_init(group_leader); + mutex_lock_double(&gctx->mutex, &ctx->mutex); + + perf_remove_from_context(group_leader, false); + list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { perf_remove_from_context(sibling, false); - perf_event__state_init(sibling); put_ctx(gctx); } - mutex_unlock(&gctx->mutex); - put_ctx(gctx); + } else { + mutex_lock(&ctx->mutex); } WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); if (move_group) { + /* + * Wait for everybody to stop referencing the events through + * the old lists, before installing it on new lists. + */ synchronize_rcu(); - perf_install_in_context(ctx, group_leader, group_leader->cpu); - get_ctx(ctx); + + /* + * Install the group siblings before the group leader. + * + * Because a group leader will try and install the entire group + * (through the sibling list, which is still in-tact), we can + * end up with siblings installed in the wrong context. + * + * By installing siblings first we NO-OP because they're not + * reachable through the group lists. + */ list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { + perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); } + + /* + * Removing from the context ends up with disabled + * event. What we want here is event in the initial + * startup state, ready to be add into new context. + */ + perf_event__state_init(group_leader); + perf_install_in_context(ctx, group_leader, group_leader->cpu); + get_ctx(ctx); } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); + + if (move_group) { + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + } mutex_unlock(&ctx->mutex); put_online_cpus(); @@ -7595,7 +7803,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; - mutex_lock(&src_ctx->mutex); + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { perf_remove_from_context(event, false); @@ -7603,11 +7815,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) put_ctx(src_ctx); list_add(&event->migrate_entry, &events); } - mutex_unlock(&src_ctx->mutex); + /* + * Wait for the events to quiesce before re-instating them. + */ synchronize_rcu(); - mutex_lock(&dst_ctx->mutex); + /* + * Re-instate events in 2 passes. + * + * Skip over group leaders and only install siblings on this first + * pass, siblings will not get enabled without a leader, however a + * leader will enable its siblings, even if those are still on the old + * context. + */ + list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + if (event->group_leader == event) + continue; + + list_del(&event->migrate_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + account_event_cpu(event, dst_cpu); + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + + /* + * Once all the siblings are setup properly, install the group leaders + * to make it go. + */ list_for_each_entry_safe(event, tmp, &events, migrate_entry) { list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) @@ -7617,6 +7854,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) get_ctx(dst_ctx); } mutex_unlock(&dst_ctx->mutex); + mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); @@ -7803,14 +8041,19 @@ static void perf_free_event(struct perf_event *event, put_event(parent); + raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); free_event(event); } /* - * free an unexposed, unused context as created by inheritance by + * Free an unexposed, unused context as created by inheritance by * perf_event_init_task below, used by fork() in case of fail. + * + * Not all locks are strictly required, but take them anyway to be nice and + * help out with the lockdep assertions. */ void perf_event_free_task(struct task_struct *task) { @@ -8129,7 +8372,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); + INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); } } @@ -8150,22 +8393,11 @@ static void perf_event_init_cpu(int cpu) } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC -static void perf_pmu_rotate_stop(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - WARN_ON(!irqs_disabled()); - - list_del_init(&cpuctx->rotation_list); -} - static void __perf_event_exit_context(void *__info) { struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; - perf_pmu_rotate_stop(ctx->pmu); - rcu_read_lock(); list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) __perf_remove_from_context(&re); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 146a5792b1d2..eadb95ce7aac 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -13,12 +13,13 @@ #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/circ_buf.h> +#include <linux/poll.h> #include "internal.h" static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->rb->poll, POLL_IN); + atomic_set(&handle->rb->poll, POLLIN); handle->event->pending_wakeup = 1; irq_work_queue(&handle->event->pending); diff --git a/kernel/exit.c b/kernel/exit.c index 1ea4369890a3..6806c55475ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1287,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { + /* + * We can race with wait_task_zombie() from another thread. + * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition + * can't confuse the checks below. + */ + int exit_state = ACCESS_ONCE(p->exit_state); int ret; - if (unlikely(p->exit_state == EXIT_DEAD)) + if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, p); @@ -1310,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - if (unlikely(p->exit_state == EXIT_TRACE)) { + if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. @@ -1337,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { + if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* diff --git a/kernel/futex.c b/kernel/futex.c index 63678b573d61..4eeb63de7e54 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; @@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: - return futex_lock_pi(uaddr, flags, val, timeout, 0); + return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: - return futex_lock_pi(uaddr, flags, 0, timeout, 1); + return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 06f58309fed2..ee619929cf90 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -127,7 +127,7 @@ static void *alloc_insn_page(void) static void free_insn_page(void *page) { - module_free(NULL, page); + module_memfree(page); } struct kprobe_insn_cache kprobe_insn_slots = { diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..4ca8eb151975 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 4d60986fcbee..d1fe2ba5bac9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mcs_spin_unlock_contended(&next->locked); } -/* - * Cancellable version of the MCS lock above. - * - * Intended for adaptive spinning of sleeping locks: - * mutex_lock()/rwsem_down_{read,write}() etc. - */ - -struct optimistic_spin_node { - struct optimistic_spin_node *next, *prev; - int locked; /* 1 if lock acquired */ - int cpu; /* encoded CPU # value */ -}; - -extern bool osq_lock(struct optimistic_spin_queue *lock); -extern void osq_unlock(struct optimistic_spin_queue *lock); - #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 5cf6731b98e9..3ef3736002d8 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(lock->owner != current); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - mutex_clear_owner(lock); } /* * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug * mutexes so that we can do it here after we've verified state. */ + mutex_clear_owner(lock); atomic_set(&lock->count, 1); } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 454195194d4a..94674e5919cb 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); * The mutex must later on be released by the same task that * acquired it. Recursive locking is not allowed. The task * may not exit without first unlocking the mutex. Also, kernel - * memory where the mutex resides mutex must not be freed with + * memory where the mutex resides must not be freed with * the mutex still locked. The mutex must first be initialized * (or statically defined) before it can be locked. memset()-ing * the mutex to 0 is not allowed. @@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, } /* - * after acquiring lock with fastpath or when we lost out in contested + * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. * * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, @@ -191,19 +191,32 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, spin_unlock_mutex(&lock->base.wait_lock, flags); } - -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* - * In order to avoid a stampede of mutex spinners from acquiring the mutex - * more or less simultaneously, the spinners need to acquire a MCS lock - * first before spinning on the owner field. + * After acquiring lock in the slowpath set ctx and wake up any + * waiters so they can recheck. * + * Callers must hold the mutex wait_lock. */ +static __always_inline void +ww_mutex_set_context_slowpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + struct mutex_waiter *cur; -/* - * Mutex spinning code migrated from kernel/sched/core.c - */ + ww_mutex_lock_acquired(lock, ctx); + lock->ctx = ctx; + + /* + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. + */ + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } +} +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { if (lock->owner != owner) @@ -307,6 +320,11 @@ static bool mutex_optimistic_spin(struct mutex *lock, if (!mutex_can_spin_on_owner(lock)) goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ if (!osq_lock(&lock->osq)) goto done; @@ -469,7 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) EXPORT_SYMBOL(ww_mutex_unlock); static inline int __sched -__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); @@ -557,7 +575,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } if (use_ww_ctx && ww_ctx->acquired > 0) { - ret = __mutex_lock_check_stamp(lock, ww_ctx); + ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; } @@ -569,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } + __set_task_state(task, TASK_RUNNING); + mutex_remove_waiter(lock, &waiter, current_thread_info()); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -582,23 +602,7 @@ skip_wait: if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct mutex_waiter *cur; - - /* - * This branch gets optimized out for the common case, - * and is only important for ww_mutex_lock. - */ - ww_mutex_lock_acquired(ww, ww_ctx); - ww->ctx = ww_ctx; - - /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. - */ - list_for_each_entry(cur, &lock->wait_list, list) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } + ww_mutex_set_context_slowpath(ww, ww_ctx); } spin_unlock_mutex(&lock->wait_lock, flags); diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c index 9887a905a762..c112d00341b0 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/osq_lock.c @@ -1,8 +1,6 @@ #include <linux/percpu.h> #include <linux/sched.h> -#include "mcs_spinlock.h" - -#ifdef CONFIG_SMP +#include <linux/osq_lock.h> /* * An MCS like lock especially tailored for optimistic spinning for sleeping @@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) * cmpxchg in an attempt to undo our queueing. */ - while (!smp_load_acquire(&node->locked)) { + while (!ACCESS_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. */ @@ -203,6 +201,3 @@ void osq_unlock(struct optimistic_spin_queue *lock) if (next) ACCESS_ONCE(next->locked) = 1; } - -#endif - diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7c98873a3077..3059bc2f022d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); } + __set_current_state(TASK_RUNNING); return ret; } @@ -1188,10 +1189,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); if (likely(!ret)) + /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); - set_current_state(TASK_RUNNING); - if (unlikely(ret)) { remove_waiter(lock, &waiter); rt_mutex_handle_deadlock(ret, chwalk, &waiter); @@ -1626,10 +1626,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); + /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - set_current_state(TASK_RUNNING); - if (unlikely(ret)) remove_waiter(lock, waiter); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2c93571162cb..2555ae15ec14 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem) set_task_state(tsk, TASK_UNINTERRUPTIBLE); } - tsk->state = TASK_RUNNING; + __set_task_state(tsk, TASK_RUNNING); out: ; } diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 7628c3fc37ca..2f7cc4076f50 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) schedule(); } - tsk->state = TASK_RUNNING; - + __set_task_state(tsk, TASK_RUNNING); return sem; } EXPORT_SYMBOL(rwsem_down_read_failed); diff --git a/kernel/module.c b/kernel/module.c index 3965511ae133..d856e96a3cce 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -772,9 +772,18 @@ static int try_stop_module(struct module *mod, int flags, int *forced) return 0; } -unsigned long module_refcount(struct module *mod) +/** + * module_refcount - return the refcount or -1 if unloading + * + * @mod: the module we're checking + * + * Returns: + * -1 if the module is in the process of unloading + * otherwise the number of references in the kernel to the module + */ +int module_refcount(struct module *mod) { - return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; + return atomic_read(&mod->refcnt) - MODULE_REF_BASE; } EXPORT_SYMBOL(module_refcount); @@ -856,7 +865,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) struct module_use *use; int printed_something = 0; - seq_printf(m, " %lu ", module_refcount(mod)); + seq_printf(m, " %i ", module_refcount(mod)); /* * Always include a trailing , so userspace can differentiate @@ -908,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); + return sprintf(buffer, "%i\n", module_refcount(mk->mod)); } static struct module_attribute modinfo_refcnt = @@ -1795,7 +1804,7 @@ static void unset_module_core_ro_nx(struct module *mod) { } static void unset_module_init_ro_nx(struct module *mod) { } #endif -void __weak module_free(struct module *mod, void *module_region) +void __weak module_memfree(void *module_region) { vfree(module_region); } @@ -1804,6 +1813,10 @@ void __weak module_arch_cleanup(struct module *mod) { } +void __weak module_arch_freeing_init(struct module *mod) +{ +} + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1841,7 +1854,8 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); + module_arch_freeing_init(mod); + module_memfree(mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -1850,7 +1864,7 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ unset_module_core_ro_nx(mod); - module_free(mod, mod->module_core); + module_memfree(mod->module_core); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2785,7 +2799,7 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_free(mod, mod->module_core); + module_memfree(mod->module_core); return -ENOMEM; } memset(ptr, 0, mod->init_size); @@ -2930,8 +2944,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); - module_free(mod, mod->module_init); - module_free(mod, mod->module_core); + module_arch_freeing_init(mod); + module_memfree(mod->module_init); + module_memfree(mod->module_core); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -2983,10 +2998,31 @@ static void do_mod_ctors(struct module *mod) #endif } +/* For freeing module_init on success, in case kallsyms traversing */ +struct mod_initfree { + struct rcu_head rcu; + void *module_init; +}; + +static void do_free_init(struct rcu_head *head) +{ + struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); + module_memfree(m->module_init); + kfree(m); +} + /* This is where the real work happens */ static int do_init_module(struct module *mod) { int ret = 0; + struct mod_initfree *freeinit; + + freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL); + if (!freeinit) { + ret = -ENOMEM; + goto fail; + } + freeinit->module_init = mod->module_init; /* * We want to find out whether @mod uses async during init. Clear @@ -2999,18 +3035,7 @@ static int do_init_module(struct module *mod) if (mod->init != NULL) ret = do_one_initcall(mod->init); if (ret < 0) { - /* - * Init routine failed: abort. Try to protect us from - * buggy refcounters. - */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up_all(&module_wq); - return ret; + goto fail_free_freeinit; } if (ret > 0) { pr_warn("%s: '%s'->init suspiciously returned %d, it should " @@ -3055,15 +3080,35 @@ static int do_init_module(struct module *mod) mod->strtab = mod->core_strtab; #endif unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); + module_arch_freeing_init(mod); mod->module_init = NULL; mod->init_size = 0; mod->init_ro_size = 0; mod->init_text_size = 0; + /* + * We want to free module_init, but be aware that kallsyms may be + * walking this with preempt disabled. In all the failure paths, + * we call synchronize_rcu/synchronize_sched, but we don't want + * to slow down the success path, so use actual RCU here. + */ + call_rcu(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); wake_up_all(&module_wq); return 0; + +fail_free_freeinit: + kfree(freeinit); +fail: + /* Try to protect us from buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; } static int may_init_module(void) diff --git a/kernel/notifier.c b/kernel/notifier.c index 4803da6eab62..ae9fc7cc360e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); +#ifdef CONFIG_SRCU /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). @@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); +#endif /* CONFIG_SRCU */ + static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, diff --git a/kernel/params.c b/kernel/params.c index 0af9b2c4e56c..728e05b167de 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -642,12 +642,15 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ + memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); mk->mp->attrs[mk->mp->num].param = kp; mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; /* Do not allow runtime DAC changes to make param writable. */ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + else + mk->mp->attrs[mk->mp->num].mattr.store = NULL; mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; mk->mp->num++; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 48b28d387c7f..7e01f78f0417 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -251,6 +251,7 @@ config APM_EMULATION config PM_OPP bool + select SRCU ---help--- SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. This diff --git a/kernel/range.c b/kernel/range.c index 322ea8e93e4b..82cfc285b046 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2) { const struct range *r1 = x1; const struct range *r2 = x2; - s64 start1, start2; - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; + if (r1->start < r2->start) + return -1; + if (r1->start > r2->start) + return 1; + return 0; } int clean_sort_range(struct range *range, int az) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index e6fae503d1bc..50a808424b06 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,4 +1,5 @@ -obj-y += update.o srcu.o +obj-y += update.o +obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 07bb02eda844..80adef7d4c3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void); void rcu_early_boot_tests(void); +/* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4d559baf06e0..30d42aa55d83 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -244,7 +244,8 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); - int (*completed)(void); + unsigned long (*started)(void); + unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) rcu_read_unlock(); } -static int rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - /* * Update callback in the pipe. This should be invoked after a grace period. */ @@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p) cur_ops->deferred_free(rp); } -static int rcu_no_completed(void) +static unsigned long rcu_no_completed(void) { return 0; } @@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, + .started = rcu_batches_started, + .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) rcu_read_unlock_bh(); } -static int rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - static void rcu_bh_torture_deferred_free(struct rcu_torture *p) { call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); @@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, + .started = rcu_batches_started_bh, + .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, @@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) srcu_read_unlock(&srcu_ctl, idx); } -static int srcu_torture_completed(void) +static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(&srcu_ctl); } @@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .started = NULL, .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, + .started = rcu_batches_started_sched, + .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, @@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = tasks_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, @@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; - int completed; - int completed_end; + unsigned long started; + unsigned long completed; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused) unsigned long long ts; idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); + started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused) static int rcu_torture_reader(void *arg) { - int completed; - int completed_end; + unsigned long started; + unsigned long completed; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; @@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); + ts, started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg) cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { n_rcu_torture_barrier_error++; + pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", + atomic_read(&barrier_cbs_invoked), + n_barrier_cbs); WARN_ON_ONCE(1); } n_barrier_successes++; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index e037f3eb2f7b..445bf8ffe3fb 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ -long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0db5649f8817..cc9ceca7bde1 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); -static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - #include "tiny_plugin.h" -/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */ -static void rcu_idle_enter_common(long long newval) -{ - if (newval) { - RCU_TRACE(trace_rcu_dyntick(TPS("--="), - rcu_dynticks_nesting, newval)); - rcu_dynticks_nesting = newval; - return; - } - RCU_TRACE(trace_rcu_dyntick(TPS("Start"), - rcu_dynticks_nesting, newval)); - if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), - rcu_dynticks_nesting, newval)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } - rcu_sched_qs(); /* implies rcu_bh_inc() */ - barrier(); - rcu_dynticks_nesting = newval; -} - /* * Enter idle, which is an extended quiescent state if we have fully - * entered that mode (i.e., if the new value of dynticks_nesting is zero). + * entered that mode. */ void rcu_idle_enter(void) { - unsigned long flags; - long long newval; - - local_irq_save(flags); - WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); - if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) - newval = 0; - else - newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(newval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_irq_exit(void) { - unsigned long flags; - long long newval; - - local_irq_save(flags); - newval = rcu_dynticks_nesting - 1; - WARN_ON_ONCE(newval < 0); - rcu_idle_enter_common(newval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_irq_exit); -/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */ -static void rcu_idle_exit_common(long long oldval) -{ - if (oldval) { - RCU_TRACE(trace_rcu_dyntick(TPS("++="), - oldval, rcu_dynticks_nesting)); - return; - } - RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); - if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), - oldval, rcu_dynticks_nesting)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - /* * Exit idle, so that we are no longer in an extended quiescent state. */ void rcu_idle_exit(void) { - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - WARN_ON_ONCE(rcu_dynticks_nesting < 0); - if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) - rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else - rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_idle_exit_common(oldval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void rcu_irq_enter(void) { - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting++; - WARN_ON_ONCE(rcu_dynticks_nesting == 0); - rcu_idle_exit_common(oldval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_irq_enter); @@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter); */ bool notrace __rcu_is_watching(void) { - return rcu_dynticks_nesting; + return true; } EXPORT_SYMBOL(__rcu_is_watching); #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ /* - * Test whether the current CPU was interrupted from idle. Nested - * interrupts don't count, we must be running at the first interrupt - * level. - */ -static int rcu_is_cpu_rrupt_from_idle(void) -{ - return rcu_dynticks_nesting <= 1; -} - -/* * Helper function for rcu_sched_qs() and rcu_bh_qs(). * Also irqs are disabled to avoid confusion due to interrupt handlers * invoking call_rcu(). @@ -250,7 +150,7 @@ void rcu_bh_qs(void) void rcu_check_callbacks(int user) { RCU_TRACE(check_cpu_stalls()); - if (user || rcu_is_cpu_rrupt_from_idle()) + if (user) rcu_sched_qs(); else if (!in_softirq()) rcu_bh_qs(); @@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head, rcp->curtail = &head->next; RCU_TRACE(rcp->qlen++); local_irq_restore(flags); + + if (unlikely(is_idle_task(current))) { + /* force scheduling for rcu_sched_qs() */ + resched_cpu(0); + } } /* @@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 858c56569127..f94e209a10d6 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) rcp->ticks_this_gp++; j = jiffies; js = ACCESS_ONCE(rcp->jiffies_stall); - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, jiffies - rcp->gp_start, rcp->qlen); dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) + } else if (ULONG_CMP_GE(j, js)) { ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); + } } static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7680fc275036..48d640ca1a05 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +/* rcuc/rcub kthread realtime priority */ +static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +module_param(kthread_prio, int, 0644); + /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); + /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a @@ -284,6 +291,22 @@ void rcu_note_context_switch(void) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); +/* + * Register a quiesecent state for all RCU flavors. If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs (but only for those + * RCU flavors in desparate need of a quiescent state, which will normally + * be none of them). Either way, do a lightweight quiescent state for + * all RCU flavors. + */ +void rcu_all_qs(void) +{ + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_qs_ctr); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU-sched batches processed thus far for debug & stats. + * Return the number of RCU batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started(void) +{ + return rcu_state_p->gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started); + +/* + * Return the number of RCU-sched batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started_sched(void) +{ + return rcu_sched_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_sched); + +/* + * Return the number of RCU BH batches started thus far for debug & stats. */ -long rcu_batches_completed_sched(void) +unsigned long rcu_batches_started_bh(void) +{ + return rcu_bh_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_bh); + +/* + * Return the number of RCU batches completed thus far for debug & stats. + */ +unsigned long rcu_batches_completed(void) +{ + return rcu_state_p->completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU-sched batches completed thus far for debug & stats. + */ +unsigned long rcu_batches_completed_sched(void) { return rcu_sched_state.completed; } EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* - * Return the number of RCU BH batches processed thus far for debug & stats. + * Return the number of RCU BH batches completed thus far for debug & stats. */ -long rcu_batches_completed_bh(void) +unsigned long rcu_batches_completed_bh(void) { return rcu_bh_state.completed; } @@ -759,39 +818,71 @@ void rcu_irq_enter(void) /** * rcu_nmi_enter - inform RCU of entry to NMI context * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is active. + * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and + * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know + * that the CPU is active. This implementation permits nested NMIs, as + * long as the nesting level does not overflow an int. (You will probably + * run out of stack space first.) */ void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int incby = 2; - if (rdtp->dynticks_nmi_nesting == 0 && - (atomic_read(&rdtp->dynticks) & 0x1)) - return; - rdtp->dynticks_nmi_nesting++; - smp_mb__before_atomic(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + /* Complain about underflow. */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); + + /* + * If idle from RCU viewpoint, atomically increment ->dynticks + * to mark non-idle and increment ->dynticks_nmi_nesting by one. + * Otherwise, increment ->dynticks_nmi_nesting by two. This means + * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * to be in the outermost NMI handler that interrupted an RCU-idle + * period (observation due to Andy Lutomirski). + */ + if (!(atomic_read(&rdtp->dynticks) & 0x1)) { + smp_mb__before_atomic(); /* Force delay from prior write. */ + atomic_inc(&rdtp->dynticks); + /* atomic_inc() before later RCU read-side crit sects */ + smp_mb__after_atomic(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + incby = 1; + } + rdtp->dynticks_nmi_nesting += incby; + barrier(); } /** * rcu_nmi_exit - inform RCU of exit from NMI context * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is no longer active. + * If we are returning from the outermost NMI handler that interrupted an + * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting + * to let the RCU grace-period handling know that the CPU is back to + * being RCU-idle. */ void rcu_nmi_exit(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - if (rdtp->dynticks_nmi_nesting == 0 || - --rdtp->dynticks_nmi_nesting != 0) + /* + * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * (We are exiting an NMI handler, so RCU better be paying attention + * to us!) + */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + + /* + * If the nesting level is not 1, the CPU wasn't RCU-idle, so + * leave it in non-RCU-idle state. + */ + if (rdtp->dynticks_nmi_nesting != 1) { + rdtp->dynticks_nmi_nesting -= 2; return; + } + + /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + rdtp->dynticks_nmi_nesting = 0; /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); @@ -898,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); return 1; } else { + if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, + rdp->mynode->gpnum)) + ACCESS_ONCE(rdp->gpwrap) = true; return 0; } } /* - * This function really isn't for public consumption, but RCU is special in - * that context switches can allow the state machine to make progress. - */ -extern void resched_cpu(int cpu); - -/* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() @@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) j1 = rcu_jiffies_till_stall_check(); ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; + rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); +} + +/* + * Complain about starvation of grace-period kthread. + */ +static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) +{ + unsigned long gpa; + unsigned long j; + + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + if (j - gpa > 2 * HZ) + pr_err("%s kthread starved for %ld jiffies!\n", + rsp->name, j - gpa); } /* @@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } -static void print_other_cpu_stall(struct rcu_state *rsp) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; long delta; unsigned long flags; + unsigned long gpa; + unsigned long j; int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining. - */ - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected += rcu_print_task_stall(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (ndetected == 0) - pr_err("INFO: Stall ended before state dump start\n"); - else + if (ndetected) { rcu_dump_cpu_stacks(rsp); + } else { + if (ACCESS_ONCE(rsp->gpnum) != gpnum || + ACCESS_ONCE(rsp->completed) == gpnum) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", + rsp->name, j - gpa, j, gpa, + jiffies_till_next_fqs); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); + rcu_check_gp_kthread_starvation(rsp); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp) pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); + + rcu_check_gp_kthread_starvation(rsp); + rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); @@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp); + print_other_cpu_stall(rsp, gpnum); } } @@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed) { + if (rdp->completed == rnp->completed && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* No grace period end, so just accelerate recent callbacks. */ ret = rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } - if (rdp->gpnum != rnp->gpnum) { + if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); + ACCESS_ONCE(rdp->gpwrap) = false; } return ret; } @@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && - rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ + rdp->completed == ACCESS_ONCE(rnp->completed) && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; @@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); @@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } mutex_unlock(&rsp->onoff_mutex); @@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ @@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; @@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg) if (rcu_gp_init(rsp)) break; cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg) ACCESS_ONCE(rsp->gpnum), TPS("fqsend")); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum) { + if ((rdp->passed_quiesce == 0 && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || + rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce) + if (!rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; /* @@ -2195,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) } /* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section. Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields. Note that + * the leaf rcu_node structure's ->qsmaskinit field has already been + * updated + * + * This function does check that the specified rcu_node structure has + * all CPUs offline and no blocked tasks, so it is OK to invoke it + * prematurely. That said, invoking it after the fact will cost you + * a needless lock acquisition. So once it has done its work, don't + * invoke it again. + */ +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + return; + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (!rnp) + break; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); /* GP memory ordering. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + return; + } + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } +} + +/* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also @@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; - int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); + raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - mask = rdp->grpmask; /* rnp->grplo is constant. */ - do { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - rnp->qsmaskinit &= ~mask; - if (rnp->qsmaskinit != 0) { - if (rnp != rdp->mynode) - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - break; - } - if (rnp == rdp->mynode) - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - else - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - mask = rnp->grpmask; - rnp = rnp->parent; - } while (rnp != NULL); - - /* - * We still hold the leaf rcu_node structure lock here, and - * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock - * held leads to deadlock. - */ - raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ - rnp = rdp->mynode; - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinit &= ~rdp->grpmask; + if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) + rcu_cleanup_dead_rnp(rnp); + rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); @@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } +static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ +} + static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } @@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp, } raw_spin_unlock_irqrestore(&rnp->lock, flags); } - rnp = rcu_get_root(rsp); - if (rnp->qsmask == 0) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - } } /* @@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) * Schedule RCU callback invocation. If the specified type of RCU * does not support RCU priority boosting, just do a direct call, * otherwise wake up the per-CPU kernel kthread. Note that because we - * are running on the current CPU with interrupts disabled, the + * are running on the current CPU with softirqs disabled, the * rcu_cpu_kthread_task cannot disappear out from under us. */ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) @@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { + rdp->qs_pending && !rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && rdp->passed_quiesce) { + } else if (rdp->qs_pending && + (rdp->passed_quiesce || + rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; } @@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || + unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } @@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp) } else { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, rsp->n_barrier_done); + smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); __call_rcu(&rdp->barrier_head, rcu_barrier_callback, rsp, cpu, 0); @@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - init_callback_list(rdp); - rdp->qlen_lazy = 0; - ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); @@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = 0; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } @@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self, static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; + int kthread_prio_in = kthread_prio; struct rcu_node *rnp; struct rcu_state *rsp; + struct sched_param sp; struct task_struct *t; + /* Force priority into range. */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + kthread_prio = 1; + else if (kthread_prio < 0) + kthread_prio = 0; + else if (kthread_prio > 99) + kthread_prio = 99; + if (kthread_prio != kthread_prio_in) + pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", + kthread_prio, kthread_prio_in); + rcu_scheduler_fully_active = 1; for_each_rcu_flavor(rsp) { - t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); + t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); rsp->gp_kthread = t; + if (kthread_prio) { + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); } rcu_spawn_nocb_kthreads(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..119de399eb2f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,7 +27,6 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> -#include <linux/irq_work.h> /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -172,11 +171,6 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ - struct completion boost_completion; - /* Used to ensure that the rt_mutex used */ - /* to carry out the boosting is fully */ - /* released with no future boostee accesses */ - /* before that rt_mutex is re-initialized. */ struct rt_mutex boost_mtx; /* Used only for the priority-boosting */ /* side effect, not as a lock. */ @@ -257,9 +251,12 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ + /* for rcu_all_qs() invocations. */ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ + bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ #ifdef CONFIG_RCU_CPU_STALL_INFO @@ -340,14 +337,10 @@ struct rcu_data { #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; - atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ - atomic_long_t nocb_q_count_lazy; /* (approximate). */ + atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ + atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ - atomic_long_t nocb_follower_count_lazy; /* (approximate). */ - int nocb_p_count; /* # CBs being invoked by kthread */ - int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -356,8 +349,6 @@ struct rcu_data { struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; /* CBs waiting for GP. */ struct rcu_head **nocb_gp_tail; - long nocb_gp_count; - long nocb_gp_count_lazy; bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ struct rcu_data *nocb_next_follower; /* Next follower in wakeup chain. */ @@ -488,10 +479,14 @@ struct rcu_state { /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ + unsigned long gp_activity; /* Time of last GP kthread */ + /* activity in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ + unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ + /* GP start. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ const char *name; /* Name of structure. */ @@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors; #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) -/* Return values for rcu_preempt_offline_tasks(). */ - -#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ - /* GP were moved to root. */ -#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ - /* GP were moved to root. */ - /* * RCU implementation internal declarations: */ @@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, - unsigned long flags); +static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -#ifdef CONFIG_HOTPLUG_CPU -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake); -#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); @@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void); #endif /* #ifndef RCU_TREE_NONCORE */ #ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_NOCB_CPU -/* Sum up queue lengths for tracing. */ +/* Read out queue lengths for tracing. */ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) { - *ql = atomic_long_read(&rdp->nocb_q_count) + - rdp->nocb_p_count + - atomic_long_read(&rdp->nocb_follower_count) + - rdp->nocb_p_count + rdp->nocb_gp_count; - *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + - rdp->nocb_p_count_lazy + - atomic_long_read(&rdp->nocb_follower_count_lazy) + - rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy; -} +#ifdef CONFIG_RCU_NOCB_CPU + *ql = atomic_long_read(&rdp->nocb_q_count); + *qll = atomic_long_read(&rdp->nocb_q_count_lazy); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ *ql = 0; *qll = 0; -} #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +} #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..2e850a51bb8f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -34,10 +34,6 @@ #include "../locking/rtmutex_common.h" -/* rcuc/rcub kthread realtime priority */ -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -module_param(kthread_prio, int, 0644); - /* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. @@ -103,6 +99,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state_p = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); /* * Tell them what RCU they are running. @@ -114,25 +112,6 @@ static void __init rcu_bootup_announce(void) } /* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -static long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked @@ -307,15 +286,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, } /* + * Return true if the specified rcu_node structure has tasks that were + * preempted within an RCU read-side critical section. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blkd_tasks); +} + +/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ void rcu_read_unlock_special(struct task_struct *t) { - int empty; - int empty_exp; - int empty_exp_now; + bool empty; + bool empty_exp; + bool empty_norm; + bool empty_exp_now; unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST @@ -367,7 +356,8 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_blocked_readers_cgp(rnp); + empty = !rcu_preempt_has_tasks(rnp); + empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -387,13 +377,21 @@ void rcu_read_unlock_special(struct task_struct *t) #endif /* #ifdef CONFIG_RCU_BOOST */ /* + * If this was the last task on the list, go see if we + * need to propagate ->qsmaskinit bit clearing up the + * rcu_node tree. + */ + if (!empty && !rcu_preempt_has_tasks(rnp)) + rcu_cleanup_dead_rnp(rnp); + + /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ empty_exp_now = !rcu_preempted_readers_exp(rnp); - if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, @@ -408,10 +406,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (drop_boost_mutex) { + if (drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); - complete(&rnp->boost_completion); - } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -519,99 +515,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp) static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (!list_empty(&rnp->blkd_tasks)) + if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } #ifdef CONFIG_HOTPLUG_CPU -/* - * Handle tasklist migration for case in which all CPUs covered by the - * specified rcu_node have gone offline. Move them up to the root - * rcu_node. The reason for not just moving them to the immediate - * parent is to remove the need for rcu_read_unlock_special() to - * make more than two attempts to acquire the target rcu_node's lock. - * Returns true if there were tasks blocking the current RCU grace - * period. - * - * Returns 1 if there was previously a task blocking the current grace - * period on the specified rcu_node structure. - * - * The caller must hold rnp->lock with irqs disabled. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - struct list_head *lp; - struct list_head *lp_root; - int retval = 0; - struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *t; - - if (rnp == rnp_root) { - WARN_ONCE(1, "Last CPU thought to be offlined?"); - return 0; /* Shouldn't happen: at least one CPU online. */ - } - - /* If we are on an internal node, complain bitterly. */ - WARN_ON_ONCE(rnp != rdp->mynode); - - /* - * Move tasks up to root rcu_node. Don't try to get fancy for - * this corner-case operation -- just put this node's tasks - * at the head of the root node's list, and update the root node's - * ->gp_tasks and ->exp_tasks pointers to those of this node's, - * if non-NULL. This might result in waiting for more tasks than - * absolutely necessary, but this is a good performance/complexity - * tradeoff. - */ - if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) - retval |= RCU_OFL_TASKS_NORM_GP; - if (rcu_preempted_readers_exp(rnp)) - retval |= RCU_OFL_TASKS_EXP_GP; - lp = &rnp->blkd_tasks; - lp_root = &rnp_root->blkd_tasks; - while (!list_empty(lp)) { - t = list_entry(lp->next, typeof(*t), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - list_del(&t->rcu_node_entry); - t->rcu_blocked_node = rnp_root; - list_add(&t->rcu_node_entry, lp_root); - if (&t->rcu_node_entry == rnp->gp_tasks) - rnp_root->gp_tasks = rnp->gp_tasks; - if (&t->rcu_node_entry == rnp->exp_tasks) - rnp_root->exp_tasks = rnp->exp_tasks; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp_root->boost_tasks = rnp->boost_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ - } - - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; -#ifdef CONFIG_RCU_BOOST - rnp->boost_tasks = NULL; - /* - * In case root is being boosted and leaf was not. Make sure - * that we boost the tasks blocking the current grace period - * in this case. - */ - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks && - rnp_root->boost_tasks != rnp_root->exp_tasks) - rnp_root->boost_tasks = rnp_root->gp_tasks; - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - - return retval; -} - #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* @@ -771,7 +681,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (list_empty(&rnp->blkd_tasks)) { + if (!rcu_preempt_has_tasks(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { rnp->exp_tasks = rnp->blkd_tasks.next; @@ -933,15 +843,6 @@ static void __init rcu_bootup_announce(void) } /* - * Return the number of RCU batches processed thus far for debug & stats. - */ -long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ @@ -960,11 +861,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU -/* Because preemptible RCU does not exist, no quieting of tasks. */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) +/* + * Because there is no preemptible RCU, there can be no readers blocked. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + return false; } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -996,23 +898,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections, and - * such non-existent tasks cannot possibly have been blocking the current - * grace period. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - return 0; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1031,20 +916,6 @@ void synchronize_rcu_expedited(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, there is never any need to - * report on tasks preempted in RCU read-side critical sections during - * expedited RCU grace periods. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). @@ -1080,7 +951,7 @@ void exit_rcu(void) static void rcu_initiate_boost_trace(struct rcu_node *rnp) { - if (list_empty(&rnp->blkd_tasks)) + if (!rcu_preempt_has_tasks(rnp)) rnp->n_balk_blkd_tasks++; else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) rnp->n_balk_exp_gp_tasks++; @@ -1127,7 +998,8 @@ static int rcu_boost(struct rcu_node *rnp) struct task_struct *t; struct list_head *tb; - if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + if (ACCESS_ONCE(rnp->exp_tasks) == NULL && + ACCESS_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); @@ -1175,15 +1047,11 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - /* Wait for boostee to be done w/boost_mtx before reinitializing. */ - wait_for_completion(&rnp->boost_completion); - return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; } @@ -1416,12 +1284,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { + if (cpumask_weight(cm) == 0) cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); - } set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } @@ -1446,12 +1310,8 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state_p); - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state_p, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - } + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } static void rcu_prepare_kthreads(int cpu) @@ -1605,7 +1465,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. */ - if (rdp->completed != rnp->completed && + if ((rdp->completed != rnp->completed || + unlikely(ACCESS_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); @@ -1898,11 +1759,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } @@ -2056,9 +1918,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + unsigned long ret; +#ifdef CONFIG_PROVE_RCU struct rcu_head *rhp; +#endif /* #ifdef CONFIG_PROVE_RCU */ - /* No-CBs CPUs might have callbacks on any of three lists. */ + /* + * Check count of all no-CBs callbacks awaiting invocation. + * There needs to be a barrier before this function is called, + * but associated with a prior determination that no more + * callbacks would be posted. In the worst case, the first + * barrier in _rcu_barrier() suffices (but the caller cannot + * necessarily rely on this, not a substitute for the caller + * getting the concurrency design right!). There must also be + * a barrier between the following load an posting of a callback + * (if a callback is in fact needed). This is associated with an + * atomic_inc() in the caller. + */ + ret = atomic_long_read(&rdp->nocb_q_count); + +#ifdef CONFIG_PROVE_RCU rhp = ACCESS_ONCE(rdp->nocb_head); if (!rhp) rhp = ACCESS_ONCE(rdp->nocb_gp_head); @@ -2072,8 +1951,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) cpu, rhp->func); WARN_ON_ONCE(1); } +#endif /* #ifdef CONFIG_PROVE_RCU */ - return !!rhp; + return !!ret; } /* @@ -2095,9 +1975,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, struct task_struct *t; /* Enqueue the callback on the nocb list and update counts. */ + atomic_long_add(rhcount, &rdp->nocb_q_count); + /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ old_rhpp = xchg(&rdp->nocb_tail, rhtp); ACCESS_ONCE(*old_rhpp) = rhp; - atomic_long_add(rhcount, &rdp->nocb_q_count); atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ @@ -2288,9 +2169,6 @@ wait_again: /* Move callbacks to wait-for-GP list, which is empty. */ ACCESS_ONCE(rdp->nocb_head) = NULL; rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); - rdp->nocb_gp_count_lazy = - atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); gotcbs = true; } @@ -2338,9 +2216,6 @@ wait_again: /* Append callbacks to follower's "done" list. */ tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); *tail = rdp->nocb_gp_head; - atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); - atomic_long_add(rdp->nocb_gp_count_lazy, - &rdp->nocb_follower_count_lazy); smp_mb__after_atomic(); /* Store *tail before wakeup. */ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* @@ -2415,13 +2290,11 @@ static int rcu_nocb_kthread(void *arg) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); ACCESS_ONCE(rdp->nocb_follower_head) = NULL; tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); - c = atomic_long_xchg(&rdp->nocb_follower_count, 0); - cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); - rdp->nocb_p_count += c; - rdp->nocb_p_count_lazy += cl; /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); + trace_rcu_batch_start(rdp->rsp->name, + atomic_long_read(&rdp->nocb_q_count_lazy), + atomic_long_read(&rdp->nocb_q_count), -1); c = cl = 0; while (list) { next = list->next; @@ -2443,9 +2316,9 @@ static int rcu_nocb_kthread(void *arg) list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); - ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) = - rdp->nocb_p_count_lazy - cl; + smp_mb__before_atomic(); /* _add after CB invocation. */ + atomic_long_add(-c, &rdp->nocb_q_count); + atomic_long_add(-cl, &rdp->nocb_q_count_lazy); rdp->n_nocbs_invoked += c; } return 0; diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e1beeb..fbb6240509ea 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "tree.h" +DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, + rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 607f852b4d04..7052d3fd4e7b 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x) unsigned long flags; int ret = 1; + /* + * Since x->done will need to be locked only + * in the non-blocking case, we check x->done + * first without taking the lock so we can + * return early in the blocking case. + */ + if (!ACCESS_ONCE(x->done)) + return 0; + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; @@ -288,13 +297,6 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; + return !!ACCESS_ONCE(x->done); } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b5797b78add6..1f37fe7f77a4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq) { s64 delta; - if (rq->skip_clock_update > 0) + lockdep_assert_held(&rq->lock); + + if (rq->clock_skip_update & RQCF_ACT_SKIP) return; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; @@ -490,6 +492,11 @@ static __init void init_hrtick(void) */ void hrtick_start(struct rq *rq, u64 delay) { + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + delay = max_t(u64, delay, 10000LL); __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, HRTIMER_MODE_REL_PINNED, 0); } @@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * this case, we can save a useless back to back clock update. */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) - rq->skip_clock_update = 1; + rq_clock_skip_update(rq, true); } #ifdef CONFIG_SMP @@ -1082,7 +1089,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); } __set_task_cpu(p, new_cpu); @@ -1814,6 +1821,10 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; + dl_se->dl_yielded = 0; } /* @@ -1832,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SMP + p->se.avg.decay_count = 0; +#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS @@ -1839,7 +1853,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif RB_CLEAR_NODE(&p->dl.rb_node); - hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + init_dl_task_timer(&p->dl); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -2049,6 +2063,9 @@ static inline int dl_bw_cpus(int i) * allocated bandwidth to reflect the new situation. * * This function is called while holding p's rq->lock. + * + * XXX we should delay bw change until the task's 0-lag point, see + * __setparam_dl(). */ static int dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) @@ -2748,6 +2765,10 @@ again: * - explicit schedule() call * - return from syscall or exception to user-space * - return from interrupt-handler to user-space + * + * WARNING: all callers must re-check need_resched() afterward and reschedule + * accordingly in case an event triggered the need for rescheduling (such as + * an interrupt waking up a task) while preemption was disabled in __schedule(). */ static void __sched __schedule(void) { @@ -2756,7 +2777,6 @@ static void __sched __schedule(void) struct rq *rq; int cpu; -need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -2776,6 +2796,8 @@ need_resched: smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); + rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -2800,13 +2822,13 @@ need_resched: switch_count = &prev->nvcsw; } - if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) + if (task_on_rq_queued(prev)) update_rq_clock(rq); next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); clear_preempt_need_resched(); - rq->skip_clock_update = 0; + rq->clock_skip_update = 0; if (likely(prev != next)) { rq->nr_switches++; @@ -2821,8 +2843,6 @@ need_resched: post_schedule(rq); sched_preempt_enable_no_resched(); - if (need_resched()) - goto need_resched; } static inline void sched_submit_work(struct task_struct *tsk) @@ -2842,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void) struct task_struct *tsk = current; sched_submit_work(tsk); - __schedule(); + do { + __schedule(); + } while (need_resched()); } EXPORT_SYMBOL(schedule); @@ -2877,6 +2899,21 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +static void preempt_schedule_common(void) +{ + do { + __preempt_count_add(PREEMPT_ACTIVE); + __schedule(); + __preempt_count_sub(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (need_resched()); +} + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -2892,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) if (likely(!preemptible())) return; - do { - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (need_resched()); + preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); @@ -3251,15 +3278,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; - init_dl_task_timer(dl_se); dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - dl_se->dl_throttled = 0; - dl_se->dl_new = 1; - dl_se->dl_yielded = 0; + + /* + * Changing the parameters of a task is 'tricky' and we're not doing + * the correct thing -- also see task_dead_dl() and switched_from_dl(). + * + * What we SHOULD do is delay the bandwidth release until the 0-lag + * point. This would include retaining the task_struct until that time + * and change dl_overflow() to not immediately decrement the current + * amount. + * + * Instead we retain the current runtime/deadline and let the new + * parameters take effect after the current reservation period lapses. + * This is safe (albeit pessimistic) because the 0-lag point is always + * before the current scheduling deadline. + * + * We can still have temporary overloads because we do not delay the + * change in bandwidth until that time; so admission control is + * not on the safe side. It does however guarantee tasks will never + * consume more than promised. + */ } /* @@ -3382,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p) return match; } +static bool dl_param_changed(struct task_struct *p, + const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + if (dl_se->dl_runtime != attr->sched_runtime || + dl_se->dl_deadline != attr->sched_deadline || + dl_se->dl_period != attr->sched_period || + dl_se->flags != attr->sched_flags) + return true; + + return false; +} + static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) @@ -3510,7 +3567,7 @@ recheck: goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; - if (dl_policy(policy)) + if (dl_policy(policy) && dl_param_changed(p, attr)) goto change; p->sched_reset_on_fork = reset_on_fork; @@ -4202,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -static void __cond_resched(void) -{ - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); -} - int __sched _cond_resched(void) { if (should_resched()) { - __cond_resched(); + preempt_schedule_common(); return 1; } return 0; @@ -4237,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) - __cond_resched(); + preempt_schedule_common(); else cpu_relax(); ret = 1; @@ -4253,7 +4303,7 @@ int __sched __cond_resched_softirq(void) if (should_resched()) { local_bh_enable(); - __cond_resched(); + preempt_schedule_common(); local_bh_disable(); return 1; } @@ -4508,9 +4558,10 @@ void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned state; + unsigned long state = p->state; - state = p->state ? __ffs(p->state) + 1 : 0; + if (state) + state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if BITS_PER_LONG == 32 @@ -4642,6 +4693,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, struct dl_bw *cur_dl_b; unsigned long flags; + if (!cpumask_weight(cur)) + return ret; + rcu_read_lock_sched(); cur_dl_b = dl_bw_of(cpumask_any(cur)); trial_cpus = cpumask_weight(trial); @@ -4740,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu) void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); @@ -7113,9 +7167,6 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif -#ifdef CONFIG_CPUMASK_OFFSTACK - alloc_size += num_possible_cpus() * cpumask_size(); -#endif if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); @@ -7135,13 +7186,13 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ + } #ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_mask, i) = (void *)ptr; - ptr += cpumask_size(); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ + for_each_possible_cpu(i) { + per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } +#endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -7253,6 +7304,11 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; + + /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again @@ -7262,11 +7318,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ @@ -7295,13 +7346,12 @@ void __might_sleep(const char *file, int line, int preempt_offset) * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. */ - if (WARN_ONCE(current->state != TASK_RUNNING, + WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, "do not call blocking ops when !TASK_RUNNING; " "state=%lx set at [<%p>] %pS\n", current->state, (void *)current->task_state_change, - (void *)current->task_state_change)) - __set_current_state(TASK_RUNNING); + (void *)current->task_state_change); ___might_sleep(file, line, preempt_offset); } @@ -7328,6 +7378,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) in_atomic(), irqs_disabled(), current->pid, current->comm); + if (task_stack_end_corrupted(current)) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 539ca3ce071b..c6acb07466bb 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; - if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { + if (later_mask && + cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { best_cpu = cpumask_any(later_mask); goto out; } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && @@ -186,6 +187,26 @@ out: } /* + * cpudl_set_freecpu - Set the cpudl.free_cpus + * @cp: the cpudl max-heap context + * @cpu: rd attached cpu + */ +void cpudl_set_freecpu(struct cpudl *cp, int cpu) +{ + cpumask_set_cpu(cpu, cp->free_cpus); +} + +/* + * cpudl_clear_freecpu - Clear the cpudl.free_cpus + * @cp: the cpudl max-heap context + * @cpu: rd attached cpu + */ +void cpudl_clear_freecpu(struct cpudl *cp, int cpu) +{ + cpumask_clear_cpu(cpu, cp->free_cpus); +} + +/* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context */ @@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp) if (!cp->elements) return -ENOMEM; - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { kfree(cp->elements); return -ENOMEM; } @@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp) for_each_possible_cpu(i) cp->elements[i].idx = IDX_INVALID; - cpumask_setall(cp->free_cpus); - return 0; } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 020039bd1326..1a0a6ef2fbe1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); int cpudl_init(struct cpudl *cp); +void cpudl_set_freecpu(struct cpudl *cp, int cpu); +void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); #endif /* CONFIG_SMP */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e5db8c6feebd..a027799ae130 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } + + if (dl_se->dl_yielded) + dl_se->dl_yielded = 0; + if (dl_se->dl_throttled) + dl_se->dl_throttled = 0; } /* @@ -536,23 +541,19 @@ again: sched_clock_tick(); update_rq_clock(rq); - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - if (task_on_rq_queued(p)) { - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_curr(rq); + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); #ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, - * check if we need to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) - push_dl_task(rq); + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); #endif - } unlock: raw_spin_unlock(&rq->lock); @@ -570,24 +571,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) static int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) { - int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); - int rorun = dl_se->runtime <= 0; - - if (!rorun && !dmiss) - return 0; - - /* - * If we are beyond our current deadline and we are still - * executing, then we have already used some of the runtime of - * the next instance. Thus, if we do not account that, we are - * stealing bandwidth from the system at each deadline miss! - */ - if (dmiss) { - dl_se->runtime = rorun ? dl_se->runtime : 0; - dl_se->runtime -= rq_clock(rq) - dl_se->deadline; - } - - return 1; + return (dl_se->runtime <= 0); } extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @@ -630,10 +614,9 @@ static void update_curr_dl(struct rq *rq) dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { + dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); - if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) - dl_se->dl_throttled = 1; - else + if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -826,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) - replenish_dl_entity(dl_se, pi_se); - else + if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); + else if (flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se, pi_se); __enqueue_dl_entity(dl_se); } @@ -870,7 +853,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. */ - if (p->dl.dl_throttled) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) return; enqueue_dl_entity(&p->dl, pi_se, flags); @@ -1090,7 +1073,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) + /* + * Even when we have runtime, update_curr_dl() might have resulted in us + * not being the leftmost task anymore. In that case NEED_RESCHED will + * be set and schedule() will start a new hrtick for the next task. + */ + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); } @@ -1111,6 +1100,7 @@ static void task_dead_dl(struct task_struct *p) * Since we are TASK_DEAD we won't slip out of the domain! */ raw_spin_lock_irq(&dl_b->lock); + /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); @@ -1182,9 +1172,6 @@ static int find_later_rq(struct task_struct *task) * We have to consider system topology and task affinity * first, then we can look for a suitable cpu. */ - cpumask_copy(later_mask, task_rq(task)->rd->span); - cpumask_and(later_mask, later_mask, cpu_active_mask); - cpumask_and(later_mask, later_mask, &task->cpus_allowed); best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask); if (best_cpu == -1) @@ -1579,6 +1566,7 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); + cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); } @@ -1590,6 +1578,7 @@ static void rq_offline_dl(struct rq *rq) dl_clear_overload(rq); cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } void init_sched_dl_class(void) @@ -1631,8 +1620,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p) static void switched_from_dl(struct rq *rq, struct task_struct *p) { + /* XXX we should retain the bw until 0-lag */ cancel_dl_timer(rq, p); - __dl_clear_params(p); /* diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 92cc52001e74..8baaf858d25c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -305,6 +305,7 @@ do { \ PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); + PN(clock_task); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df2cdf77f899..7ce18f3c097a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p) { u32 slice; - p->se.avg.decay_count = 0; slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; p->se.avg.runnable_avg_sum = slice; p->se.avg.runnable_avg_period = slice; @@ -1730,7 +1729,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) nodes = node_online_map; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsigned long max_faults = 0; - nodemask_t max_group; + nodemask_t max_group = NODE_MASK_NONE; int a, b; /* Are there nodes at this distance from each other? */ @@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; + se->avg.decay_count = 0; if (!decays) return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.decay_count = 0; return decays; } @@ -4005,6 +4004,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + /* init_cfs_bandwidth() was not called */ + if (!cfs_b->throttled_cfs_rq.next) + return; + hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); } @@ -4424,7 +4427,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) * wl = S * s'_i; see (2) */ if (W > 0 && w < W) - wl = (w * tg->shares) / W; + wl = (w * (long)tg->shares) / W; else wl = tg->shares; @@ -5153,7 +5156,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq->skip_clock_update = 1; + rq_clock_skip_update(rq, true); } set_skip_buddy(se); @@ -5945,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu) */ age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); + delta = __rq_clock_broken(rq) - age_stamp; - delta = rq_clock(rq) - age_stamp; if (unlikely(delta < 0)) delta = 0; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c47fce75e666..aaf1c1d5cf5d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -47,7 +47,8 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - while (!tif_need_resched()) + while (!tif_need_resched() && + (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ee15f5a0d1c1..f4d4b077eba0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) enqueue = 1; /* - * Force a clock update if the CPU was idle, - * lest wakeup -> unthrottle time accumulate. + * When we're idle and a woken (rt) task is + * throttled check_preempt_curr() will set + * skip_update and the time between the wakeup + * and this unthrottle will get accounted as + * 'runtime'. */ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq->skip_clock_update = -1; + rq_clock_skip_update(rq, false); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; @@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) curr->prio <= p->prio)) { int target = find_lowest_rq(p); - if (target != -1) + /* + * Don't bother moving it if the destination CPU is + * not running a lower priority task. + */ + if (target != -1 && + p->prio < cpu_rq(target)->rt.highest_prio.curr) cpu = target; } rcu_read_unlock(); @@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) lowest_rq = cpu_rq(cpu); + if (lowest_rq->rt.highest_prio.curr <= task->prio) { + /* + * Target rq has tasks of equal or higher priority, + * retrying does not release any lock and is unlikely + * to yield a different result. + */ + lowest_rq = NULL; + break; + } + /* if the prio of this runqueue changed, try again */ if (double_lock_balance(rq, lowest_rq)) { /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a2a45c970e7..0870db23d79c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -558,8 +558,6 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif - int skip_clock_update; - /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -588,6 +586,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; + unsigned int clock_skip_update; u64 clock; u64 clock_task; @@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return ACCESS_ONCE(rq->clock); +} + static inline u64 rq_clock(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock_task; } +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 + +static inline void rq_clock_skip_update(struct rq *rq, bool skip) +{ + lockdep_assert_held(&rq->lock); + if (skip) + rq->clock_skip_update |= RQCF_REQ_SKIP; + else + rq->clock_skip_update &= ~RQCF_REQ_SKIP; +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f032fb5284e3..40190f28db35 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -280,6 +280,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) unsigned int cpu; int ret = 0; + get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); @@ -292,6 +293,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) list_add(&plug_thread->list, &hotplug_threads); out: mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); return ret; } EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); diff --git a/kernel/softirq.c b/kernel/softirq.c index 501baa9ac1be..479e4436f787 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) trace_softirqs_off(ip); raw_local_irq_restore(flags); - if (preempt_count() == cnt) + if (preempt_count() == cnt) { +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); +#endif trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + } } EXPORT_SYMBOL(__local_bh_disable_ip); #endif /* CONFIG_TRACE_IRQFLAGS */ @@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu) * in the task stack here. */ __do_softirq(); - rcu_note_context_switch(); local_irq_enable(); - cond_resched(); + cond_resched_rcu_qs(); return; } local_irq_enable(); diff --git a/kernel/sys.c b/kernel/sys.c index a8c9f5a7dda6..ea9c88109894 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2210,9 +2210,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; error = MPX_ENABLE_MANAGEMENT(me); break; case PR_MPX_DISABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; error = MPX_DISABLE_MANAGEMENT(me); break; default: diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..3f5e183c3d97 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -122,7 +122,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); boot = ktime_add(mono, off_boot); xtim = ktime_add(mono, off_real); - tai = ktime_add(xtim, off_tai); + tai = ktime_add(mono, off_tai); base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; @@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /* * Divide a ktime value by a nanosecond value */ -u64 ktime_divns(const ktime_t kt, s64 div) +u64 __ktime_divns(const ktime_t kt, s64 div) { u64 dclc; int sft = 0; @@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) return dclc; } -EXPORT_SYMBOL_GPL(ktime_divns); +EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* @@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +{ + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct timerqueue_node *next; + struct hrtimer *timer; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + } + /* + * clock_was_set() might have changed base->offset of any of + * the clock bases so the result might be negative. Fix it up + * to prevent a false positive in clockevents_program_event(). + */ + if (expires_next.tv64 < 0) + expires_next.tv64 = 0; + return expires_next; +} +#endif + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - int i; - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires, expires_next; - - expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - timer = container_of(next, struct hrtimer, node); - - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - /* - * clock_was_set() has changed base->offset so the - * result might be negative. Fix it up to prevent a - * false positive in clockevents_program_event() - */ - if (expires.tv64 < 0) - expires.tv64 = 0; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - } + ktime_t expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -587,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, return 0; /* + * When the target cpu of the timer is currently executing + * hrtimer_interrupt(), then we do not touch the clock event + * device. hrtimer_interrupt() will reevaluate all clock bases + * before reprogramming the device. + */ + if (cpu_base->in_hrtirq) + return 0; + + /* * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry * which we enforced in the hang detection. We want the system @@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); ktime_t hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; + ktime_t mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; - int i; raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - delta.tv64 = hrtimer_get_expires_tv64(timer); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; - } - } + if (!hrtimer_hres_active()) + mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), + ktime_get()); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock(&cpu_base->lock); entry_time = now = hrtimer_update_base(cpu_base); retry: - expires_next.tv64 = KTIME_MAX; + cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1291,28 +1291,20 @@ retry: * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ - - if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (expires.tv64 < 0) - expires.tv64 = KTIME_MAX; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - } __run_hrtimer(timer, &basenow); } } - + /* Reevaluate the clock bases for the next expiry */ + expires_next = __hrtimer_get_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; + cpu_base->in_hrtirq = 0; raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87a346fd6d61..4b585e0fdd22 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work) getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec adjust = timespec64_to_timespec(now); + struct timespec64 adjust = now; fail = -ENODEV; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); #ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock(adjust); + fail = update_persistent_clock(timespec64_to_timespec(adjust)); #endif #ifdef CONFIG_RTC_SYSTOHC if (fail == -ENODEV) @@ -633,6 +633,13 @@ int ntp_validate_timex(struct timex *txc) if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) return -EPERM; + if (txc->modes & ADJ_FREQUENCY) { + if (LONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + return 0; } diff --git a/kernel/time/time.c b/kernel/time/time.c index 6390517e77d4..2c85b7724af4 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, if (tv) { if (copy_from_user(&user_tv, tv, sizeof(*tv))) return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + new_ts.tv_sec = user_tv.tv_sec; new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6a931852082f..b124af259800 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1659,24 +1659,24 @@ out: } /** - * getboottime - Return the real time of system boot. - * @ts: pointer to the timespec to be set + * getboottime64 - Return the real time of system boot. + * @ts: pointer to the timespec64 to be set * - * Returns the wall-time of boot in a timespec. + * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ -void getboottime(struct timespec *ts) +void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); - *ts = ktime_to_timespec(t); + *ts = ktime_to_timespec64(t); } -EXPORT_SYMBOL_GPL(getboottime); +EXPORT_SYMBOL_GPL(getboottime64); unsigned long get_seconds(void) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 929a733d302e..224e768bdc73 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2497,12 +2497,14 @@ static void ftrace_run_update_code(int command) } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, - struct ftrace_hash *old_hash) + struct ftrace_ops_hash *old_hash) { ops->flags |= FTRACE_OPS_FL_MODIFYING; - ops->old_hash.filter_hash = old_hash; + ops->old_hash.filter_hash = old_hash->filter_hash; + ops->old_hash.notrace_hash = old_hash->notrace_hash; ftrace_run_update_code(command); ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; ops->flags &= ~FTRACE_OPS_FL_MODIFYING; } @@ -3579,7 +3581,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = static int ftrace_probe_registered; -static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) +static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) { int ret; int i; @@ -3637,6 +3639,7 @@ int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { + struct ftrace_ops_hash old_hash_ops; struct ftrace_func_probe *entry; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *old_hash = *orig_hash; @@ -3658,6 +3661,10 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_lock(&trace_probe_ops.func_hash->regex_lock); + old_hash_ops.filter_hash = old_hash; + /* Probes only have filters */ + old_hash_ops.notrace_hash = NULL; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); if (!hash) { count = -ENOMEM; @@ -3718,7 +3725,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); - __enable_ftrace_function_probe(old_hash); + __enable_ftrace_function_probe(&old_hash_ops); if (!ret) free_ftrace_hash_rcu(old_hash); @@ -4006,10 +4013,34 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) } static void ftrace_ops_update_code(struct ftrace_ops *ops, - struct ftrace_hash *old_hash) + struct ftrace_ops_hash *old_hash) { - if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) + struct ftrace_ops *op; + + if (!ftrace_enabled) + return; + + if (ops->flags & FTRACE_OPS_FL_ENABLED) { ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); + return; + } + + /* + * If this is the shared global_ops filter, then we need to + * check if there is another ops that shares it, is enabled. + * If so, we still need to run the modify code. + */ + if (ops->func_hash != &global_ops.local_hash) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->func_hash == &global_ops.local_hash && + op->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); + /* Only need to do this once */ + return; + } + } while_for_each_ftrace_op(op); } static int @@ -4017,6 +4048,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; + struct ftrace_ops_hash old_hash_ops; struct ftrace_hash *old_hash; struct ftrace_hash *hash; int ret; @@ -4053,9 +4085,11 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_lock); old_hash = *orig_hash; + old_hash_ops.filter_hash = ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret) { - ftrace_ops_update_code(ops, old_hash); + ftrace_ops_update_code(ops, &old_hash_ops); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); @@ -4267,6 +4301,7 @@ static void __init set_ftrace_early_filters(void) int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_ops_hash old_hash_ops; struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; @@ -4300,10 +4335,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_lock(&ftrace_lock); old_hash = *orig_hash; + old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); if (!ret) { - ftrace_ops_update_code(iter->ops, old_hash); + ftrace_ops_update_code(iter->ops, &old_hash_ops); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e767972e99c..4a9079b9f082 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6918,7 +6918,6 @@ void __init trace_init(void) tracepoint_printk = 0; } tracer_alloc_buffers(); - init_ftrace_syscalls(); trace_event_init(); } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4b9c114ee9de..6fa484de2ba1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags) } void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) + struct pt_regs **regs, int *rctxp) { struct trace_entry *entry; unsigned long flags; @@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type, if (*rctxp < 0) return NULL; + if (regs) + *regs = this_cpu_ptr(&__perf_regs[*rctxp]); raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 366a78a3e61e..b03a0ea77b99 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2429,12 +2429,39 @@ static __init int event_trace_memsetup(void) return 0; } +static __init void +early_enable_events(struct trace_array *tr, bool disable_first) +{ + char *buf = bootup_event_buf; + char *token; + int ret; + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + /* Restarting syscalls requires that we stop them first */ + if (disable_first) + ftrace_set_clr_event(tr, token, 0); + + ret = ftrace_set_clr_event(tr, token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + + /* Put back the comma to allow this to be called again */ + if (buf) + *(buf - 1) = ','; + } +} + static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); struct ftrace_event_call **iter, *call; - char *buf = bootup_event_buf; - char *token; int ret; if (!tr) @@ -2456,18 +2483,7 @@ static __init int event_trace_enable(void) */ __trace_early_add_events(tr); - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; - if (!*token) - continue; - - ret = ftrace_set_clr_event(tr, token, 1); - if (ret) - pr_warn("Failed to enable trace event: %s\n", token); - } + early_enable_events(tr, false); trace_printk_start_comm(); @@ -2478,6 +2494,31 @@ static __init int event_trace_enable(void) return 0; } +/* + * event_trace_enable() is called from trace_event_init() first to + * initialize events and perhaps start any events that are on the + * command line. Unfortunately, there are some events that will not + * start this early, like the system call tracepoints that need + * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() + * is called before pid 1 starts, and this flag is never set, making + * the syscall tracepoint never get reached, but the event is enabled + * regardless (and not doing anything). + */ +static __init int event_trace_enable_again(void) +{ + struct trace_array *tr; + + tr = top_trace_array(); + if (!tr) + return -ENODEV; + + early_enable_events(tr, true); + + return 0; +} + +early_initcall(event_trace_enable_again); + static __init int event_trace_init(void) { struct trace_array *tr; diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index b0b1c44e923a..3ccf5c2c1320 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -132,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv) static __init int kdb_ftrace_register(void) { - kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", - "Dump ftrace log", 0, KDB_REPEAT_NONE); + kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5edb518be345..296079ae6583 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; @@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c6ee36fcbf90..f97f6e3a676c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size -= sizeof(u32); rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->event.type, regs, &rctx); + sys_data->enter_event->event.type, NULL, &rctx); if (!rec) return; @@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size -= sizeof(u32); rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->event.type, regs, &rctx); + sys_data->exit_event->event.type, NULL, &rctx); if (!rec) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8520acc34b18..b11441321e7a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (hlist_empty(head)) goto out; - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) goto out; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6202b08f1933..beeeac9e0e3e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool) * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. */ -static bool maybe_create_worker(struct worker_pool *pool) +static void maybe_create_worker(struct worker_pool *pool) __releases(&pool->lock) __acquires(&pool->lock) { - if (!need_to_create_worker(pool)) - return false; restart: spin_unlock_irq(&pool->lock); @@ -1877,7 +1871,6 @@ restart: */ if (need_to_create_worker(pool)) goto restart; - return true; } /** @@ -1897,16 +1890,14 @@ restart: * multiple times. Does GFP_KERNEL allocations. * * Return: - * %false if the pool don't need management and the caller can safely start - * processing works, %true indicates that the function released pool->lock - * and reacquired it to perform some management function and that the - * conditions that the caller verified while holding the lock before - * calling the function might no longer be true. + * %false if the pool doesn't need management and the caller can safely + * start processing works, %true if management function was performed and + * the conditions that the caller verified before calling the function may + * no longer be true. */ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - bool ret = false; /* * Anyone who successfully grabs manager_arb wins the arbitration @@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker) * actual management, the pool may stall indefinitely. */ if (!mutex_trylock(&pool->manager_arb)) - return ret; + return false; - ret |= maybe_create_worker(pool); + maybe_create_worker(pool); mutex_unlock(&pool->manager_arb); - return ret; + return true; } /** |