diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-04 08:13:52 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-04 08:13:52 -0700 |
commit | 0081a0ce809b611c1f37da5d6ae5bc8027ffd1c4 (patch) | |
tree | 600b4fc3b74d5142fe4a2c2f72d934e6204ca432 /kernel | |
parent | fea1543760351e10e9c573ddf8861c2f23f5b866 (diff) | |
parent | 94edf6f3c20c9c8ee301bde04150a91bab4bf32c (diff) |
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU updates from Ingo Molnar:
"The main RCU related changes in this cycle were:
- Removal of spin_unlock_wait()
- SRCU updates
- RCU torture-test updates
- RCU Documentation updates
- Extend the sys_membarrier() ABI with the MEMBARRIER_CMD_PRIVATE_EXPEDITED variant
- Miscellaneous RCU fixes
- CPU-hotplug fixes"
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (63 commits)
arch: Remove spin_unlock_wait() arch-specific definitions
locking: Remove spin_unlock_wait() generic definitions
drivers/ata: Replace spin_unlock_wait() with lock/unlock pair
ipc: Replace spin_unlock_wait() with lock/unlock pair
exit: Replace spin_unlock_wait() with lock/unlock pair
completion: Replace spin_unlock_wait() with lock/unlock pair
doc: Set down RCU's scheduling-clock-interrupt needs
doc: No longer allowed to use rcu_dereference on non-pointers
doc: Add RCU files to docbook-generation files
doc: Update memory-barriers.txt for read-to-write dependencies
doc: Update RCU documentation
membarrier: Provide expedited private command
rcu: Remove exports from rcu_idle_exit() and rcu_idle_enter()
rcu: Add warning to rcu_idle_enter() for irqs enabled
rcu: Make rcu_idle_enter() rely on callers disabling irqs
rcu: Add assertions verifying blocked-tasks list
rcu/tracing: Set disable_rcu_irq_enter on rcu_eqs_exit()
rcu: Add TPS() protection for _rcu_barrier_trace strings
rcu: Use idle versions of swait to make idle-hack clear
swait: Add idle variants which don't contribute to load average
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 1 | ||||
-rw-r--r-- | kernel/cpu.c | 1 | ||||
-rw-r--r-- | kernel/exit.c | 10 | ||||
-rw-r--r-- | kernel/locking/qspinlock.c | 117 | ||||
-rw-r--r-- | kernel/membarrier.c | 70 | ||||
-rw-r--r-- | kernel/rcu/Kconfig | 3 | ||||
-rw-r--r-- | kernel/rcu/rcu.h | 128 | ||||
-rw-r--r-- | kernel/rcu/rcu_segcblist.c | 108 | ||||
-rw-r--r-- | kernel/rcu/rcu_segcblist.h | 28 | ||||
-rw-r--r-- | kernel/rcu/rcuperf.c | 17 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 83 | ||||
-rw-r--r-- | kernel/rcu/srcutiny.c | 8 | ||||
-rw-r--r-- | kernel/rcu/srcutree.c | 50 | ||||
-rw-r--r-- | kernel/rcu/tiny.c | 2 | ||||
-rw-r--r-- | kernel/rcu/tiny_plugin.h | 47 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 213 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 15 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 2 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 238 | ||||
-rw-r--r-- | kernel/rcu/update.c | 18 | ||||
-rw-r--r-- | kernel/sched/Makefile | 1 | ||||
-rw-r--r-- | kernel/sched/completion.c | 11 | ||||
-rw-r--r-- | kernel/sched/core.c | 38 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 152 | ||||
-rw-r--r-- | kernel/task_work.c | 8 | ||||
-rw-r--r-- | kernel/torture.c | 2 |
26 files changed, 573 insertions, 798 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 4cb8e8b23c6e..9c323a6daa46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/cpu.c b/kernel/cpu.c index eee033134262..bfbd649ccdc8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -650,6 +650,7 @@ static int takedown_cpu(unsigned int cpu) __cpu_die(cpu); tick_cleanup_dead_cpu(cpu); + rcutree_migrate_callbacks(cpu); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..f9ef3ecc78c1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -764,7 +764,6 @@ void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; - TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); kcov_task_exit(tsk); @@ -819,7 +818,8 @@ void __noreturn do_exit(long code) * Ensure that we must observe the pi_state in exit_mm() -> * mm_release() -> exit_pi_state_list(). 
*/ - raw_spin_unlock_wait(&tsk->pi_lock); + raw_spin_lock_irq(&tsk->pi_lock); + raw_spin_unlock_irq(&tsk->pi_lock); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", @@ -881,9 +881,7 @@ void __noreturn do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); - TASKS_RCU(preempt_disable()); - TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); - TASKS_RCU(preempt_enable()); + exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); @@ -918,7 +916,7 @@ void __noreturn do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); - TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); + exit_tasks_rcu_finish(); do_task_dead(); } diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index fd24153e8a48..294294c71ba4 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif -/* - * Various notes on spin_is_locked() and spin_unlock_wait(), which are - * 'interesting' functions: - * - * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE - * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, - * PPC). Also qspinlock has a similar issue per construction, the setting of - * the locked byte can be unordered acquiring the lock proper. - * - * This gets to be 'interesting' in the following cases, where the /should/s - * end up false because of this issue. 
- * - * - * CASE 1: - * - * So the spin_is_locked() correctness issue comes from something like: - * - * CPU0 CPU1 - * - * global_lock(); local_lock(i) - * spin_lock(&G) spin_lock(&L[i]) - * for (i) if (!spin_is_locked(&G)) { - * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); - * return; - * } - * // deal with fail - * - * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such - * that there is exclusion between the two critical sections. - * - * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from - * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) - * /should/ be constrained by the ACQUIRE from spin_lock(&G). - * - * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. - * - * - * CASE 2: - * - * For spin_unlock_wait() there is a second correctness issue, namely: - * - * CPU0 CPU1 - * - * flag = set; - * smp_mb(); spin_lock(&l) - * spin_unlock_wait(&l); if (!flag) - * // add to lockless list - * spin_unlock(&l); - * // iterate lockless list - * - * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 - * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE - * semantics etc..) - * - * Where flag /should/ be ordered against the locked store of l. - */ - -/* - * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before - * issuing an _unordered_ store to set _Q_LOCKED_VAL. - * - * This means that the store can be delayed, but no later than the - * store-release from the unlock. This means that simply observing - * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. 
- * - * There are two paths that can issue the unordered store: - * - * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 - * - * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 - * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 - * - * However, in both cases we have other !0 state we've set before to queue - * ourseves: - * - * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our - * load is constrained by that ACQUIRE to not pass before that, and thus must - * observe the store. - * - * For (2) we have a more intersting scenario. We enqueue ourselves using - * xchg_tail(), which ends up being a RELEASE. This in itself is not - * sufficient, however that is followed by an smp_cond_acquire() on the same - * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and - * guarantees we must observe that store. - * - * Therefore both cases have other !0 state that is observable before the - * unordered locked byte store comes through. This means we can use that to - * wait for the lock store, and then wait for an unlock. 
- */ -#ifndef queued_spin_unlock_wait -void queued_spin_unlock_wait(struct qspinlock *lock) -{ - u32 val; - - for (;;) { - val = atomic_read(&lock->val); - - if (!val) /* not locked, we're done */ - goto done; - - if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ - break; - - /* not locked, but pending, wait until we observe the lock */ - cpu_relax(); - } - - /* any unlock is good */ - while (atomic_read(&lock->val) & _Q_LOCKED_MASK) - cpu_relax(); - -done: - smp_acquire__after_ctrl_dep(); -} -EXPORT_SYMBOL(queued_spin_unlock_wait); -#endif - #endif /* _GEN_PV_LOCK_SLOWPATH */ /** diff --git a/kernel/membarrier.c b/kernel/membarrier.c deleted file mode 100644 index 9f9284f37f8d..000000000000 --- a/kernel/membarrier.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> - * - * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include <linux/syscalls.h> -#include <linux/membarrier.h> -#include <linux/tick.h> - -/* - * Bitmask made from a "or" of all commands within enum membarrier_cmd, - * except MEMBARRIER_CMD_QUERY. - */ -#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) - -/** - * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. - * - * If this system call is not implemented, -ENOSYS is returned. 
If the - * command specified does not exist, or if the command argument is invalid, - * this system call returns -EINVAL. For a given command, with flags argument - * set to 0, this system call is guaranteed to always return the same value - * until reboot. - * - * All memory accesses performed in program order from each targeted thread - * is guaranteed to be ordered with respect to sys_membarrier(). If we use - * the semantic "barrier()" to represent a compiler barrier forcing memory - * accesses to be performed in program order across the barrier, and - * smp_mb() to represent explicit memory barriers forcing full memory - * ordering across the barrier, we have the following ordering table for - * each pair of barrier(), sys_membarrier() and smp_mb(): - * - * The pair ordering is detailed as (O: ordered, X: not ordered): - * - * barrier() smp_mb() sys_membarrier() - * barrier() X X O - * smp_mb() X O O - * sys_membarrier() O O O - */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) -{ - /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ - if (tick_nohz_full_enabled()) - return -ENOSYS; - if (unlikely(flags)) - return -EINVAL; - switch (cmd) { - case MEMBARRIER_CMD_QUERY: - return MEMBARRIER_CMD_BITMASK; - case MEMBARRIER_CMD_SHARED: - if (num_online_cpus() > 1) - synchronize_sched(); - return 0; - default: - return -EINVAL; - } -} diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index be90c945063f..9210379c0353 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -69,8 +69,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - bool - default n + def_bool PREEMPT select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 808b8c85f626..e4b43fef89f5 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -356,22 +356,10 @@ do { \ #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. 
*/ -static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ -{ - return true; -} -static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ -{ - return false; -} - -static inline void rcu_expedite_gp(void) -{ -} - -static inline void rcu_unexpedite_gp(void) -{ -} +static inline bool rcu_gp_is_normal(void) { return true; } +static inline bool rcu_gp_is_expedited(void) { return false; } +static inline void rcu_expedite_gp(void) { } +static inline void rcu_unexpedite_gp(void) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ @@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, *gpnum = 0; *completed = 0; } -static inline void rcutorture_record_test_transition(void) -{ -} -static inline void rcutorture_record_progress(unsigned long vernum) -{ -} +static inline void rcutorture_record_test_transition(void) { } +static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU - -/* - * Return the number of grace periods started. - */ -static inline unsigned long rcu_batches_started(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods started. - */ -static inline unsigned long rcu_batches_started_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods started. - */ -static inline unsigned long rcu_batches_started_sched(void) -{ - return 0; -} - -/* - * Return the number of grace periods completed. - */ -static inline unsigned long rcu_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods completed. 
- */ -static inline unsigned long rcu_batches_completed_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods completed. - */ -static inline unsigned long rcu_batches_completed_sched(void) -{ - return 0; -} - -/* - * Return the number of expedited grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of expedited sched grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed_sched(void) -{ - return 0; -} - -static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return 0; -} - -static inline void rcu_force_quiescent_state(void) -{ -} - -static inline void rcu_bh_force_quiescent_state(void) -{ -} - -static inline void rcu_sched_force_quiescent_state(void) -{ -} - -static inline void show_rcu_gp_kthreads(void) -{ -} - +static inline unsigned long rcu_batches_started(void) { return 0; } +static inline unsigned long rcu_batches_started_bh(void) { return 0; } +static inline unsigned long rcu_batches_started_sched(void) { return 0; } +static inline unsigned long rcu_batches_completed(void) { return 0; } +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } +static inline unsigned long rcu_batches_completed_sched(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } +static inline unsigned long +srcu_batches_completed(struct srcu_struct *sp) { return 0; } +static inline void rcu_force_quiescent_state(void) { } +static inline void rcu_bh_force_quiescent_state(void) { } +static inline void rcu_sched_force_quiescent_state(void) { } +static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 
2b62a38b080f..7649fcd2c4c7 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -36,24 +36,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp) } /* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. - */ -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ - int cnt = 0; - struct rcu_head **rhpp = &rclp->head; - - for (;;) { - if (!*rhpp) - return cnt; - if (++cnt > lim) - return -1; - rhpp = &(*rhpp)->next; - } -} - -/* * Dequeue the oldest rcu_head structure from the specified callback * list. This function assumes that the callback is non-lazy, but * the caller can later invoke rcu_cblist_dequeued_lazy() if it @@ -103,17 +85,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? - */ -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ - if (seg == RCU_DONE_TAIL) - return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; - return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - -/* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? */ @@ -134,50 +105,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) } /* - * Dequeue and return the first ready-to-invoke callback. If there - * are no ready-to-invoke callbacks, return NULL. Disables interrupts - * to avoid interference. Does not protect from interference from other - * CPUs or tasks. 
- */ -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - int i; - struct rcu_head *rhp; - - local_irq_save(flags); - if (!rcu_segcblist_ready_cbs(rsclp)) { - local_irq_restore(flags); - return NULL; - } - rhp = rsclp->head; - BUG_ON(!rhp); - rsclp->head = rhp->next; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { - if (rsclp->tails[i] != &rhp->next) - break; - rsclp->tails[i] = &rsclp->head; - } - smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ - WRITE_ONCE(rsclp->len, rsclp->len - 1); - local_irq_restore(flags); - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - - local_irq_save(flags); - rsclp->len_lazy--; - local_irq_restore(flags); -} - -/* * Return a pointer to the first callback in the specified rcu_segcblist * structure. This is useful for diagnostics. */ @@ -203,17 +130,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) } /* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? - */ -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - -/* * Enqueue the specified callback onto the specified rcu_segcblist * structure, updating accounting as needed. Note that the ->len * field may be accessed locklessly, hence the WRITE_ONCE(). @@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, return true; return false; } + +/* + * Merge the source rcu_segcblist structure into the destination + * rcu_segcblist structure, then initialize the source. Any pending + * callbacks from the source get to start over. 
It is best to + * advance and accelerate both the destination and the source + * before merging. + */ +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, + struct rcu_segcblist *src_rsclp) +{ + struct rcu_cblist donecbs; + struct rcu_cblist pendcbs; + + rcu_cblist_init(&donecbs); + rcu_cblist_init(&pendcbs); + rcu_segcblist_extract_count(src_rsclp, &donecbs); + rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); + rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); + rcu_segcblist_insert_count(dst_rsclp, &donecbs); + rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); + rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); + rcu_segcblist_init(src_rsclp); +} diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 6e36e36478cd..581c12b63544 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) rclp->len_lazy--; } -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ - return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. 
- */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ - WARN_ON_ONCE(!rclp->head); - return rclp->tail; -} - void rcu_cblist_init(struct rcu_cblist *rclp); -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim); struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); /* @@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp, bool lazy); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, @@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, unsigned long seq); +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, + struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cc18110b612..1f87a02c3399 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks perf testing. 
*/ @@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; -#define RCUPERF_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUPERF_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * If performance tests complete, wait for shutdown to commence. */ @@ -658,7 +643,7 @@ rcu_perf_init(void) int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, - RCUPERF_TASKS_OPS + &tasks_ops, }; if (!torture_init_begin(perf_type, verbose, &perf_runnable)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b8f7f8ce8575..45f2ffbc1e78 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); static u64 notrace rcu_trace_clock_local(void) { u64 ts = trace_clock_local(); - unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + + (void)do_div(ts, NSEC_PER_USEC); return ts; } #else /* #ifdef CONFIG_RCU_TRACE */ @@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, - .name = "rcu_busted" + .name = "busted" }; /* @@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp) delay = torture_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) + if (!delay && in_task()) schedule_timeout_interruptible(longdelay); else rcu_read_delay(rrsp); @@ -561,44 +562,7 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { - int __maybe_unused cpu; - int idx; - -#ifdef CONFIG_TREE_SRCU - idx = srcu_ctlp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - 
long c0, c1; - struct srcu_data *counts; - - counts = per_cpu_ptr(srcu_ctlp->sda, cpu); - u0 = counts->srcu_unlock_count[!idx]; - u1 = counts->srcu_unlock_count[idx]; - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = counts->srcu_lock_count[!idx]; - l1 = counts->srcu_lock_count[idx]; - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld)", cpu, c0, c1); - } - pr_cont("\n"); -#elif defined(CONFIG_TINY_SRCU) - idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", - torture_type, TORTURE_FLAG, idx, - READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), - READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); -#endif + srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG); } static void srcu_torture_synchronize_expedited(void) @@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .irq_capable = 1, .name = "srcu" }; @@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .irq_capable = 1, .name = "srcud" }; @@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks torture testing. */ @@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -#define RCUTORTURE_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUTORTURE_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * RCU torture priority-boost testing. 
Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg) return 0; } +static void rcu_torture_timer_cb(struct rcu_head *rhp) +{ + kfree(rhp); +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); + + /* Test call_rcu() invocation from interrupt handler. */ + if (cur_ops->call) { + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT); + + if (rhp) + cur_ops->call(rhp, rcu_torture_timer_cb); + } } /* @@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void) srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gpnum, &completed); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags, - wtp == NULL ? ~0UL : wtp->state); + wtp == NULL ? ~0UL : wtp->state, + wtp == NULL ? 
-1 : (int)task_cpu(wtp)); show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } @@ -1749,7 +1714,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &sched_ops, RCUTORTURE_TASKS_OPS + &sched_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 1a1c1047d2ed..76ac5f50b2c7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -33,6 +33,8 @@ #include "rcu_segcblist.h" #include "rcu.h" +int rcu_scheduler_active __read_mostly; + static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->srcu_lock_nesting[0] = 0; @@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp) destroy_rcu_head_on_stack(&rs.head); } EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* Lockdep diagnostics. */ +void __init rcu_scheduler_starting(void) +{ + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; +} diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d0ca524bf042..729a8706751d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444); static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void process_srcu(struct work_struct *work); /* * Initialize SRCU combining tree. Note that statically allocated @@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); wait_for_completion(&rcu.completion); destroy_rcu_head_on_stack(&rcu.head); + + /* + * Make sure that later code is ordered after the SRCU grace + * period. This pairs with the raw_spin_lock_irq_rcu_node() + * in srcu_invoke_callbacks(). 
Unlike Tree RCU, this is needed + * because the current CPU might have been totally uninvolved with + * (and thus unordered against) that grace period. + */ + smp_mb(); } /** @@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* * This is the work-queue function that handles SRCU grace periods. */ -void process_srcu(struct work_struct *work) +static void process_srcu(struct work_struct *work) { struct srcu_struct *sp; @@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work) srcu_advance_state(sp); srcu_reschedule(sp, srcu_get_delay(sp)); } -EXPORT_SYMBOL_GPL(process_srcu); void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, @@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) +{ + int cpu; + int idx; + unsigned long s0 = 0, s1 = 0; + + idx = sp->srcu_idx & 0x1; + pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *counts; + + counts = per_cpu_ptr(sp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. 
+ */ + smp_rmb(); + + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + s0 += c0; + s1 += c1; + } + pr_cont(" T(%ld,%ld)\n", s0, s1); +} +EXPORT_SYMBOL_GPL(srcu_torture_stats_print); + static int __init srcu_bootup_announce(void) { pr_info("Hierarchical SRCU implementation.\n"); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f8488965250f..a64eee0db39e 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .curtail = &rcu_bh_ctrlblk.rcucblist, }; -#include "tiny_plugin.h" - void rcu_barrier_bh(void) { wait_rcu_gp(call_rcu_bh); diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h deleted file mode 100644 index f0a01b2a3062..000000000000 --- a/kernel/rcu/tiny_plugin.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition - * Internal non-public definitions that provide either classic - * or preemptible semantics. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (c) 2010 Linaro - * - * Author: Paul E. 
McKenney <paulmck@linux.vnet.ibm.com> - */ - -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) -#include <linux/kernel_stat.h> - -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - -/* - * During boot, we forgive RCU lockdep issues. After this function is - * invoked, we start taking RCU lockdep issues seriously. Note that unlike - * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE - * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. - * The reason for this is that Tiny RCU does not need kthreads, so does - * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. Unless SRCU is in the mix. - */ -void __init rcu_scheduler_starting(void) -{ - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) - ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; -} - -#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..84fe96641b2e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \ .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ - .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ - .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ - .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { - unsigned long flags; - - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); rcu_eqs_enter(false); - local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL /** @@ -862,7 +855,8 @@ 
EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { - rcu_eqs_enter(1); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!"); + rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ @@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user) if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { + __this_cpu_inc(disable_rcu_irq_enter); rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_eqs_exit_common(oldval, user); + __this_cpu_dec(disable_rcu_irq_enter); } } @@ -979,7 +975,6 @@ void rcu_idle_exit(void) rcu_eqs_exit(false); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_exit); #ifdef CONFIG_NO_HZ_FULL /** @@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + rsp->gp_kthread ? rsp->gp_kthread->state : ~0, + rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); wake_up_process(rsp->gp_kthread); @@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* - * Helper function for wait_event_interruptible_timeout() wakeup - * at force-quiescent-state time. + * Helper function for swait_event_idle() wakeup at force-quiescent-state + * time. 
*/ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) { @@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_interruptible(rsp->gp_wq, - READ_ONCE(rsp->gp_flags) & - RCU_GP_FLAG_INIT); + swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & + RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) @@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_idle_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, mask, rnp->qsmask, rnp->level, @@ -2563,85 +2560,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline, and the caller must hold the - * ->orphan_lock. - */ -static void -rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, - struct rcu_node *rnp, struct rcu_data *rdp) -{ - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs do not have orphanable callbacks. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) - return; - - /* - * Orphan the callbacks. First adjust the counts. This is safe - * because _rcu_barrier() excludes CPU-hotplug operations, so it - * cannot be running now. Thus no memory barrier is required. 
- */ - rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); - rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); - - /* - * Next, move those callbacks still needing a grace period to - * the orphanage, where some other CPU will pick them up. - * Some of the callbacks might have gone partway through a grace - * period, but that is too bad. They get to start over because we - * cannot assume that grace periods are synchronized across CPUs. - */ - rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - - /* - * Then move the ready-to-invoke callbacks to the orphanage, - * where some other CPU will pick them up. These will not be - * required to pass though another grace period: They are done. - */ - rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - - /* Finally, disallow further callbacks on this CPU. */ - rcu_segcblist_disable(&rdp->cblist); -} - -/* - * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->orphan_lock. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) -{ - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs are handled specially. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) - return; - - /* Do the accounting first. */ - rdp->n_cbs_adopted += rsp->orphan_done.len; - if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) - rcu_idle_count_callbacks_posted(); - rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); - - /* - * We do not need a memory barrier here because the only way we - * can get here if there is an rcu_barrier() in flight is if - * we are the task doing the rcu_barrier(). - */ - - /* First adopt the ready-to-invoke callbacks, then the done ones. 
*/ - rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); - WARN_ON_ONCE(rsp->orphan_done.head); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - WARN_ON_ONCE(rsp->orphan_pend.head); - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != - !rcu_segcblist_n_cbs(&rdp->cblist)); -} - -/* * Trace the fact that this CPU is going offline. */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) @@ -2704,14 +2622,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) /* * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup, - * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them. There can only be one CPU hotplug operation at a time, - * so no other CPU can be attempting to update rcu_cpu_kthread_task. + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2720,18 +2636,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. 
*/ - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); - rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp, flags); - raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - - WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || - !rcu_segcblist_empty(&rdp->cblist), - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", - cpu, rcu_segcblist_n_cbs(&rdp->cblist), - rcu_segcblist_first_cb(&rdp->cblist)); } /* @@ -3569,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("LastCB"), -1, + rsp->barrier_sequence); complete(&rsp->barrier_completion); } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence); } } @@ -3584,14 +3489,15 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { atomic_inc(&rsp->barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1, + rsp->barrier_sequence); } } @@ -3605,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp) struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Begin", -1, s); + _rcu_barrier_trace(rsp, TPS("Begin"), -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); /* Did someone else do our work for us? 
*/ if (rcu_seq_done(&rsp->barrier_sequence, s)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1, + rsp->barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; @@ -3620,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Mark the start of the barrier operation. */ rcu_seq_start(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3643,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp) rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { - _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu, rsp->barrier_sequence); } else { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu, rsp->barrier_sequence); smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); @@ -3654,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp) rcu_barrier_callback, rsp, cpu, 0); } } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { - _rcu_barrier_trace(rsp, "OnlineQ", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu, rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); } else { - _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu, rsp->barrier_sequence); } } @@ -3675,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp) wait_for_completion(&rsp->barrier_completion); /* Mark the end of the barrier operation. */ - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence); rcu_seq_end(&rsp->barrier_sequence); /* Other rcu_barrier() invocations can now safely proceed. 
*/ @@ -3777,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - if (!rdp->beenonline) - WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; @@ -3882,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; + int nbits; + unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_state *rsp; @@ -3892,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->qsmaskinitnext |= mask; + oldmask = rnp->expmaskinitnext; rnp->expmaskinitnext |= mask; + oldmask ^= rnp->expmaskinitnext; + nbits = bitmap_weight(&oldmask, BITS_PER_LONG); + /* Allow lockless access for expedited grace periods. */ + smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } #ifdef CONFIG_HOTPLUG_CPU @@ -3937,6 +3850,50 @@ void rcu_report_dead(unsigned int cpu) for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); } + +/* Migrate the dead CPU's callbacks to the current CPU. */ +static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *my_rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + + if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) + return; /* No callbacks to migrate. */ + + local_irq_save(flags); + my_rdp = this_cpu_ptr(rsp->rda); + if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { + local_irq_restore(flags); + return; + } + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. 
*/ + rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != + !rcu_segcblist_n_cbs(&my_rdp->cblist)); + raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || + !rcu_segcblist_empty(&rdp->cblist), + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", + cpu, rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_first_cb(&rdp->cblist)); +} + +/* + * The outgoing CPU has just passed through the dying-idle state, + * and we are being invoked from the CPU that was IPIed to continue the + * offline operation. We need to migrate the outgoing CPU's callbacks. + */ +void rcutree_migrate_callbacks(int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_migrate_callbacks(cpu, rsp); +} #endif /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9af0f31d6847..8e1f285f0a70 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -219,8 +219,6 @@ struct rcu_data { /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ - unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ - unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -268,7 +266,9 @@ struct rcu_data { struct rcu_head **nocb_follower_tail; struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; + raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + struct timer_list nocb_timer; /* Enforce finite deferral. */ /* The following fields are used by the leader, hence own cacheline. 
*/ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; @@ -350,15 +350,6 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; - /* Protect following fields. */ - struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ - /* need a grace period. */ - struct rcu_cblist orphan_done; /* Orphaned callbacks that */ - /* are ready to invoke. */ - /* (Contains counts.) */ - /* End of fields guarded by orphan_lock. */ - struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ @@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dd21ca47e4b4..46d61b597731 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) unsigned long flags; unsigned long mask; unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); + int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. 
*/ struct rcu_node *rnp; struct rcu_node *rnp_up; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 908b309d60d7..55bde94b9572 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) struct task_struct *t = current; lockdep_assert_held(&rnp->lock); + WARN_ON_ONCE(rdp->mynode != rnp); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); /* * Decide where to queue the newly blocked task. In theory, @@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; + WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != + !(rnp->qsmask & rdp->grpmask)); + WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != + !(rnp->expmask & rdp->grpmask)); raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* @@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t) if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; if (IS_ENABLED(CONFIG_RCU_BOOST)) { - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; /* Snapshot ->boost_mtx ownership w/rnp->lock held. 
*/ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; } /* @@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + struct task_struct *t; + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (rcu_preempt_has_tasks(rnp)) + if (rcu_preempt_has_tasks(rnp)) { rnp->gp_tasks = rnp->blkd_tasks.next; + t = container_of(rnp->gp_tasks, struct task_struct, + rcu_node_entry); + trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), + rnp->gpnum, t->pid); + } WARN_ON_ONCE(rnp->qsmask); } @@ -1788,23 +1802,62 @@ bool rcu_is_nocb_cpu(int cpu) } /* - * Kick the leader kthread for this NOCB group. + * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock + * and this function releases it. */ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) +static void __wake_nocb_leader(struct rcu_data *rdp, bool force, + unsigned long flags) + __releases(rdp->nocb_lock) { struct rcu_data *rdp_leader = rdp->nocb_leader; - if (!READ_ONCE(rdp_leader->nocb_kthread)) + lockdep_assert_held(&rdp->nocb_lock); + if (!READ_ONCE(rdp_leader->nocb_kthread)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; - if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { + } + if (rdp_leader->nocb_leader_sleep || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + del_timer(&rdp->nocb_timer); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); + } else { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } } /* + * Kick the leader kthread for this NOCB group, but caller has not + * acquired locks. 
+ */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + __wake_nocb_leader(rdp, force, flags); +} + +/* + * Arrange to wake the leader kthread for this NOCB group at some + * future time when it is safe to do so. + */ +static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, + const char *reason) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) + mod_timer(&rdp->nocb_timer, jiffies + 1); + WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +} + +/* * Does the specified CPU need an RCU callback for the specified flavor * of rcu_barrier()? */ @@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeEmptyIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { @@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. 
*/ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeOvfIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, + TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { @@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is * not a no-CBs CPU. */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { - long ql = rsp->orphan_done.len; - long qll = rsp->orphan_done.len_lazy; - - /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!"); if (!rcu_is_nocb_cpu(smp_processor_id())) - return false; - - /* First, enqueue the donelist, if any. This preserves CB ordering. */ - if (rsp->orphan_done.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), - rcu_cblist_tail(&rsp->orphan_done), - ql, qll, flags); - } - if (rsp->orphan_pend.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), - rcu_cblist_tail(&rsp->orphan_pend), - ql, qll, flags); - } - rcu_cblist_init(&rsp->orphan_done); - rcu_cblist_init(&rsp->orphan_pend); + return false; /* Not NOCBs CPU, caller must migrate CBs. 
*/ + __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), + rcu_segcblist_tail(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_disable(&rdp->cblist); return true; } @@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) static void nocb_leader_wait(struct rcu_data *my_rdp) { bool firsttime = true; + unsigned long flags; bool gotcbs; struct rcu_data *rdp; struct rcu_head **tail; @@ -2039,13 +2076,17 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); - /* Memory barrier handled by smp_mb() calls below and repoll. */ + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); + my_rdp->nocb_leader_sleep = true; + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll")); } /* @@ -2054,7 +2095,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; - smp_mb(); /* wakeup before ->nocb_head reads. */ + smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) @@ -2066,56 +2107,41 @@ wait_again: gotcbs = true; } - /* - * If there were no callbacks, sleep a bit, rescan after a - * memory barrier, and go retry. - */ + /* No callbacks? Sleep a bit if polling, and go retry. 
*/ if (unlikely(!gotcbs)) { - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, - "WokeEmpty"); WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); - - /* Rescan in case we were a victim of memory ordering. */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) - if (READ_ONCE(rdp->nocb_head)) { - /* Found CB, so short-circuit next wait. */ - my_rdp->nocb_leader_sleep = false; - break; - } + if (rcu_nocb_poll) { + schedule_timeout_interruptible(1); + } else { + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, + TPS("WokeEmpty")); + } goto wait_again; } /* Wait for one grace period. */ rcu_nocb_wait_gp(my_rdp); - /* - * We left ->nocb_leader_sleep unset to reduce cache thrashing. - * We set it now, but recheck for new callbacks while - * traversing our follower list. - */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ - /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - if (READ_ONCE(rdp->nocb_head)) + if (!rcu_nocb_poll && + READ_ONCE(rdp->nocb_head) && + READ_ONCE(my_rdp->nocb_leader_sleep)) { + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); + } if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ /* Append callbacks to follower's "done" list. */ - tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = rdp->nocb_gp_tail; *tail = rdp->nocb_gp_head; - smp_mb__after_atomic(); /* Store *tail before wakeup. 
*/ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { - /* - * List was empty, wake up the follower. - * Memory barriers supplied by atomic_long_add(). - */ + /* List was empty, so wake up the follower. */ swake_up(&rdp->nocb_wq); } } @@ -2131,28 +2157,16 @@ wait_again: */ static void nocb_follower_wait(struct rcu_data *rdp) { - bool firsttime = true; - for (;;) { - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "FollowerSleep"); - swait_event_interruptible(rdp->nocb_wq, - READ_ONCE(rdp->nocb_follower_head)); - } else if (firsttime) { - /* Don't drown trace log with "Poll"! */ - firsttime = false; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); - } + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); + swait_event_interruptible(rdp->nocb_wq, + READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { /* ^^^ Ensure CB invocation follows _head test. */ return; } - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "WokeEmpty"); WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty")); } } @@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) static int rcu_nocb_kthread(void *arg) { int c, cl; + unsigned long flags; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg) nocb_follower_wait(rdp); /* Pull the ready-to-invoke callbacks onto local list. 
*/ - list = READ_ONCE(rdp->nocb_follower_head); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + list = rdp->nocb_follower_head; + rdp->nocb_follower_head = NULL; + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); - WRITE_ONCE(rdp->nocb_follower_head, NULL); - tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, @@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread(). */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { + unsigned long flags; int ndw; - if (!rcu_nocb_need_deferred_wakeup(rdp)) + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_nocb_need_deferred_wakeup(rdp)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; + } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); + __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } +/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ +static void do_nocb_deferred_wakeup_timer(unsigned long x) +{ + do_nocb_deferred_wakeup_common((struct rcu_data *)x); +} + +/* + * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. + * This means we do an inexact common-case check. Note that if + * we miss, ->nocb_timer will eventually clean things up. 
+ */ +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + if (rcu_nocb_need_deferred_wakeup(rdp)) + do_nocb_deferred_wakeup_common(rdp); +} + void __init rcu_init_nohz(void) { int cpu; @@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) rdp->nocb_tail = &rdp->nocb_head; init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_lock_init(&rdp->nocb_lock); + setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, + (unsigned long)rdp); } /* @@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, return false; } -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 00e77c470017..5033b66d2753 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); /* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_SRCU(tasks_rcu_exit_srcu); +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void) mutex_unlock(&rcu_tasks_kthread_mutex); } +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). 
*/ +void exit_tasks_rcu_finish(void) +{ + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + preempt_enable(); +} + #endif /* #ifdef CONFIG_TASKS_RCU */ #ifndef CONFIG_TINY_RCU diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 53f0164ed362..78f54932ea1d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae9bf2f..c9524d2d9316 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -300,6 +300,8 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { + unsigned long flags; + if (!READ_ONCE(x->done)) return false; @@ -307,14 +309,9 @@ bool completion_done(struct completion *x) * If ->done, we need to wait for complete() to release ->wait.lock * otherwise we can end up freeing the completion before complete() * is done referencing it. - * - * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders - * the loads of ->done and ->wait.lock such that we cannot observe - * the lock before complete() acquires it while observing the ->done - * after it's acquired the lock. 
*/ - smp_rmb(); - spin_unlock_wait(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); + spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0869b20fba81..e053c31d96da 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -951,8 +951,13 @@ struct migration_arg { static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) { - if (unlikely(!cpu_active(dest_cpu))) - return rq; + if (p->flags & PF_KTHREAD) { + if (unlikely(!cpu_online(dest_cpu))) + return rq; + } else { + if (unlikely(!cpu_active(dest_cpu))) + return rq; + } /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) @@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + /* + * The membarrier system call requires a full memory barrier + * after storing to rq->curr, before going back to user-space. + * + * TODO: This smp_mb__after_unlock_lock can go away if PPC end + * up adding a full barrier to switch_mm(), or we should figure + * out if a smp_mb__after_unlock_lock is really the proper API + * to use. + */ + smp_mb__after_unlock_lock(); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; rq->curr = next; + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. For TSO + * (e.g. x86), the architecture must provide its own + * barrier in switch_mm(). For weakly ordered machines + * for which spin_unlock() acts as a full memory + * barrier, finish_lock_switch() in common code takes + * care of this barrier. 
For weakly ordered machines for + * which spin_unlock() acts as a RELEASE barrier (only + * arm64 and PowerPC), arm64 has a full barrier in + * switch_to(), and PowerPC has + * smp_mb__after_unlock_lock() before + * finish_lock_switch(). + */ ++*switch_count; trace_sched_switch(preempt, prev, next); @@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void) * To avoid it, we have to wait for releasing tsk->pi_lock which * is held by try_to_wake_up() */ - smp_mb(); - raw_spin_unlock_wait(&current->pi_lock); + raw_spin_lock_irq(&current->pi_lock); + raw_spin_unlock_irq(&current->pi_lock); /* Causes final put_task_struct in finish_task_switch(): */ __set_current_state(TASK_DEAD); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c new file mode 100644 index 000000000000..a92fddc22747 --- /dev/null +++ b/kernel/sched/membarrier.c @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/syscalls.h> +#include <linux/membarrier.h> +#include <linux/tick.h> +#include <linux/cpumask.h> + +#include "sched.h" /* for cpu_rq(). */ + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + +static void ipi_mb(void *info) +{ + smp_mb(); /* IPIs should be serializing but paranoid. 
*/ +} + +static void membarrier_private_expedited(void) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (num_online_cpus() == 1) + return; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm == current->mm) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. + */ + smp_mb(); /* exit from system call is not a mb */ +} + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, not available on the running + * kernel, or if the command argument is invalid, this system call + * returns -EINVAL. 
For a given command, with flags argument set to 0, + * this system call is guaranteed to always return the same value until + * reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + { + int cmd_mask = MEMBARRIER_CMD_BITMASK; + + if (tick_nohz_full_enabled()) + cmd_mask &= ~MEMBARRIER_CMD_SHARED; + return cmd_mask; + } + case MEMBARRIER_CMD_SHARED: + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + if (tick_nohz_full_enabled()) + return -EINVAL; + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + case MEMBARRIER_CMD_PRIVATE_EXPEDITED: + membarrier_private_expedited(); + return 0; + default: + return -EINVAL; + } +} diff --git a/kernel/task_work.c b/kernel/task_work.c index d513051fcca2..836a72a66fba 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -96,20 +96,16 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ + raw_spin_lock_irq(&task->pi_lock); do { work = READ_ONCE(task->task_works); head = !work && (task->flags & PF_EXITING) ? 
&work_exited : NULL; } while (cmpxchg(&task->task_works, work, head) != work); + raw_spin_unlock_irq(&task->pi_lock); if (!work) break; - /* - * Synchronize with task_work_cancel(). It can't remove - * the first entry == work, cmpxchg(task_works) should - * fail, but it can play with *work and other entries. - */ - raw_spin_unlock_wait(&task->pi_lock); do { next = work->next; diff --git a/kernel/torture.c b/kernel/torture.c index 55de96529287..637e172835d8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, torture_type, cpu); (*n_offl_successes)++; delta = jiffies - starttime; - sum_offl += delta; + *sum_offl += delta; if (*min_offl < 0) { *min_offl = delta; *max_offl = delta; |