From 825c5bd2fd47d30148db15fc121216c483682b01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 26 May 2017 16:16:40 -0700 Subject: srcu: Move rcu_scheduler_starting() from Tiny RCU to Tiny SRCU Other than lockdep support, Tiny RCU has no need for the scheduler status. However, Tiny SRCU will need this to control boot-time behavior independent of lockdep. Therefore, this commit moves rcu_scheduler_starting() from kernel/rcu/tiny_plugin.h to kernel/rcu/srcutiny.c. This in turn allows the complete removal of kernel/rcu/tiny_plugin.h. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutiny.c | 8 ++++++++ kernel/rcu/tiny.c | 2 -- kernel/rcu/tiny_plugin.h | 47 ----------------------------------------------- 3 files changed, 8 insertions(+), 49 deletions(-) delete mode 100644 kernel/rcu/tiny_plugin.h (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 1a1c1047d2ed..76ac5f50b2c7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -33,6 +33,8 @@ #include "rcu_segcblist.h" #include "rcu.h" +int rcu_scheduler_active __read_mostly; + static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->srcu_lock_nesting[0] = 0; @@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp) destroy_rcu_head_on_stack(&rs.head); } EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* Lockdep diagnostics. */ +void __init rcu_scheduler_starting(void) +{ + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; +} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f8488965250f..a64eee0db39e 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .curtail = &rcu_bh_ctrlblk.rcucblist, }; -#include "tiny_plugin.h" - void rcu_barrier_bh(void) { wait_rcu_gp(call_rcu_bh); diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h deleted file mode 100644 index f0a01b2a3062..000000000000 --- a/kernel/rcu/tiny_plugin.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition - * Internal non-public definitions that provide either classic - * or preemptible semantics. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (c) 2010 Linaro - * - * Author: Paul E. McKenney - */ - -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) -#include - -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - -/* - * During boot, we forgive RCU lockdep issues. After this function is - * invoked, we start taking RCU lockdep issues seriously. Note that unlike - * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE - * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. - * The reason for this is that Tiny RCU does not need kthreads, so does - * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. Unless SRCU is in the mix. - */ -void __init rcu_scheduler_starting(void) -{ - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) - ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; -} - -#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ -- cgit v1.2.3 From 0d8a1e831e21d955af68f4ae5658b58b7ec25557 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Jun 2017 17:06:38 -0700 Subject: srcu: Make process_srcu() be static The function process_srcu() is not invoked outside of srcutree.c, so this commit makes it static and drops the EXPORT_SYMBOL_GPL(). Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 -- kernel/rcu/srcutree.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 42973f787e7e..a026a9493bde 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -104,8 +104,6 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -void process_srcu(struct work_struct *work); - #define __SRCU_STRUCT_INIT(name) \ { \ .sda = &name##_srcu_data, \ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d0ca524bf042..94bd6ed43ea3 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444); static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void process_srcu(struct work_struct *work); /* * Initialize SRCU combining tree. Note that statically allocated @@ -1194,7 +1195,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* * This is the work-queue function that handles SRCU grace periods. */ -void process_srcu(struct work_struct *work) +static void process_srcu(struct work_struct *work) { struct srcu_struct *sp; @@ -1203,7 +1204,6 @@ void process_srcu(struct work_struct *work) srcu_advance_state(sp); srcu_reschedule(sp, srcu_get_delay(sp)); } -EXPORT_SYMBOL_GPL(process_srcu); void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, -- cgit v1.2.3 From 115a1a5285664f1931c30457081b4ae1e648f1f9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 May 2017 13:31:03 -0700 Subject: rcutorture: Move SRCU status printing to SRCU implementations This commit gets rid of some ugly #ifdefs in rcutorture.c by moving the SRCU status printing to the SRCU implementations. Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 13 +++++++++++++ include/linux/srcutree.h | 1 + kernel/rcu/rcutorture.c | 39 +-------------------------------------- kernel/rcu/srcutree.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index cfbfc540cafc..261471f407a5 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -87,4 +87,17 @@ static inline void srcu_barrier(struct srcu_struct *sp) synchronize_srcu(sp); } +/* Defined here to avoid size increase for non-torture kernels. */ +static inline void srcu_torture_stats_print(struct srcu_struct *sp, + char *tt, char *tf) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx) & 0x1; + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", + tt, tf, idx, + READ_ONCE(sp->srcu_lock_nesting[!idx]), + READ_ONCE(sp->srcu_lock_nesting[idx])); +} + #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 42973f787e7e..7886356bdc84 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -141,5 +141,6 @@ void process_srcu(struct work_struct *work); void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); +void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf); #endif diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b8f7f8ce8575..aedc8f2ad955 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -561,44 +561,7 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { - int __maybe_unused cpu; - int idx; - -#ifdef CONFIG_TREE_SRCU - idx = srcu_ctlp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *counts; - - counts = per_cpu_ptr(srcu_ctlp->sda, cpu); - u0 = counts->srcu_unlock_count[!idx]; - u1 = counts->srcu_unlock_count[idx]; - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = counts->srcu_lock_count[!idx]; - l1 = counts->srcu_lock_count[idx]; - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld)", cpu, c0, c1); - } - pr_cont("\n"); -#elif defined(CONFIG_TINY_SRCU) - idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", - torture_type, TORTURE_FLAG, idx, - READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), - READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); -#endif + srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG); } static void srcu_torture_synchronize_expedited(void) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d0ca524bf042..8f6fd11c338a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1217,6 +1217,40 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) +{ + int cpu; + int idx; + + idx = sp->srcu_idx & 0x1; + pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *counts; + + counts = per_cpu_ptr(sp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + } + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(srcu_torture_stats_print); + static int __init srcu_bootup_announce(void) { pr_info("Hierarchical SRCU implementation.\n"); -- cgit v1.2.3 From ac3748c60426602c091c8c9281c95fadb781fcc0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 May 2017 13:59:52 -0700 Subject: rcutorture: Print SRCU lock/unlock totals This commit adds printing of SRCU lock/unlock totals, which are just the sums of the per-CPU counts. Saves a bit of mental arithmetic. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 8f6fd11c338a..b4f491b06ee0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1221,6 +1221,7 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) { int cpu; int idx; + unsigned long s0 = 0, s1 = 0; idx = sp->srcu_idx & 0x1; pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); @@ -1246,8 +1247,10 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) c0 = l0 - u0; c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + s0 += c0; + s1 += c1; } - pr_cont("\n"); + pr_cont(" T(%ld,%ld)\n", s0, s1); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); -- cgit v1.2.3 From f1dbc54b929c5917c57b6661c9e37e42868b26a4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 May 2017 08:23:06 -0700 Subject: rcu: Remove CONFIG_TASKS_RCU ifdef from rcuperf.c The synchronize_rcu_tasks() and call_rcu_tasks() APIs are now available regardless of kernel configuration, so this commit removes the CONFIG_TASKS_RCU ifdef from rcuperf.c. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cc18110b612..1f87a02c3399 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks perf testing. */ @@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; -#define RCUPERF_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUPERF_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * If performance tests complete, wait for shutdown to commence. */ @@ -658,7 +643,7 @@ rcu_perf_init(void) int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, - RCUPERF_TASKS_OPS + &tasks_ops, }; if (!torture_init_begin(perf_type, verbose, &perf_runnable)) -- cgit v1.2.3 From 5e741fa9e9698f4010bb85eff252186f7a4071f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Jun 2017 12:52:44 -0700 Subject: rcutorture: Enable SRCU readers from timer handler Now that it is legal to invoke srcu_read_lock() and srcu_read_unlock() for a given srcu_struct from both process context and {soft,}irq handlers, it is time to test it. This commit therefore enables testing of SRCU readers from rcutorture's timer handler, using in_task() to determine whether or not it is safe to sleep in the SRCU read-side critical sections. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index aedc8f2ad955..8d59c82bec0b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -522,7 +522,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp) delay = torture_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) + if (!delay && in_task()) schedule_timeout_interruptible(longdelay); else rcu_read_delay(rrsp); @@ -583,6 +583,7 @@ static struct rcu_torture_ops srcu_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .irq_capable = 1, .name = "srcu" }; @@ -615,6 +616,7 @@ static struct rcu_torture_ops srcud_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .irq_capable = 1, .name = "srcud" }; -- cgit v1.2.3 From b3c983142d4584c9d506b1ed31b65f4292b4aea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Jun 2017 16:39:00 -0700 Subject: rcutorture: Place event-traced strings into trace buffer Strings used in event tracing need to be specially handled, for example, being copied to the trace buffer instead of being pointed to by the trace buffer. Although the TPS() macro can be used to "launder" pointed-to strings, this might not be all that effective within a loadable module. This commit therefore copies rcutorture's strings to the trace buffer. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt --- include/trace/events/rcu.h | 7 +++++-- kernel/rcu/rcutorture.c | 2 +- tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 91dc089d65b7..e91ae1f2290d 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -703,6 +703,7 @@ TRACE_EVENT(rcu_batch_end, * at the beginning and end of the read, respectively. Note that the * callback address can be NULL. */ +#define RCUTORTURENAME_LEN 8 TRACE_EVENT(rcu_torture_read, TP_PROTO(const char *rcutorturename, struct rcu_head *rhp, @@ -711,7 +712,7 @@ TRACE_EVENT(rcu_torture_read, TP_ARGS(rcutorturename, rhp, secs, c_old, c), TP_STRUCT__entry( - __field(const char *, rcutorturename) + __field(char, rcutorturename[RCUTORTURENAME_LEN]) __field(struct rcu_head *, rhp) __field(unsigned long, secs) __field(unsigned long, c_old) @@ -719,7 +720,9 @@ TRACE_EVENT(rcu_torture_read, ), TP_fast_assign( - __entry->rcutorturename = rcutorturename; + strncpy(__entry->rcutorturename, rcutorturename, + RCUTORTURENAME_LEN); + __entry->rcutorturename[RCUTORTURENAME_LEN - 1] = 0; __entry->rhp = rhp; __entry->secs = secs; __entry->c_old = c_old; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8d59c82bec0b..b48d2107f176 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -496,7 +496,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, - .name = "rcu_busted" + .name = "busted" }; /* diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot index 6804f9dcfc1b..be7728db42fd 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot @@ -1 +1 @@ -rcutorture.torture_type=rcu_busted +rcutorture.torture_type=busted -- cgit v1.2.3 From 808de39cf422aa08dffd29510d841848ea18e215 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jun 2017 10:03:22 -0700 Subject: rcutorture: Add task's CPU for rcutorture writer stalls It appears that at least some of the rcutorture writer stall messages coincide with unusually long CPU-online operations, for example, no fewer than 205 seconds in a recent test. It is of course possible that the writer stall is not unrelated to this unusually long CPU-hotplug operation, and so this commit adds the rcutorture writer task's CPU to the stall message to gain more information about this possible connection. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b48d2107f176..75ac749ced7c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1319,11 +1319,12 @@ rcu_torture_stats_print(void) srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gpnum, &completed); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags, - wtp == NULL ? ~0UL : wtp->state); + wtp == NULL ? ~0UL : wtp->state, + wtp == NULL ? -1 : (int)task_cpu(wtp)); show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } -- cgit v1.2.3 From a3b7b6c2739caf996b95a0164b5b4541c27630c3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jun 2017 16:07:17 -0700 Subject: rcutorture: Eliminate unused ts_rem local from rcu_trace_clock_local() This commit removes an unused local variable named ts_rem that is marked __maybe_unused. Yes, the variable was assigned to, but it was never used beyond that point, hence not needed. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 75ac749ced7c..6e3f644280ee 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); static u64 notrace rcu_trace_clock_local(void) { u64 ts = trace_clock_local(); - unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + + (void)do_div(ts, NSEC_PER_USEC); return ts; } #else /* #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.3 From 96036c43066a04c99353abb4a342cc38a52d36f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Jul 2017 13:52:18 -0700 Subject: rcu: Add last-CPU to GP-kthread starvation messages This commit augments the grace-period-kthread starvation debugging messages by adding the last CPU that ran the kthread. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..48c6ab5ca164 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1358,12 +1358,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + rsp->gp_kthread ? rsp->gp_kthread->state : ~0, + rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); wake_up_process(rsp->gp_kthread); -- cgit v1.2.3 From f34c8585ed70f0f9b5ff9cf59c0dd533cddb975f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Jul 2017 15:27:32 -0700 Subject: rcutorture: Invoke call_rcu() from timer handler The Linux kernel invokes call_rcu() from various interrupt/softirq handlers, but rcutorture does not. This commit therefore adds this behavior to rcutorture's repertoire. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6e3f644280ee..0efd69b2fb8c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1080,6 +1080,11 @@ rcu_torture_fakewriter(void *arg) return 0; } +static void rcu_torture_timer_cb(struct rcu_head *rhp) +{ + kfree(rhp); +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1142,6 +1147,14 @@ static void rcu_torture_timer(unsigned long unused) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); + + /* Test call_rcu() invocation from interrupt handler. */ + if (cur_ops->call) { + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT); + + if (rhp) + cur_ops->call(rhp, rcu_torture_timer_cb); + } } /* -- cgit v1.2.3 From 8be6e1b15c54402106e6ba9bc706e685458b2d2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 29 Apr 2017 20:03:20 -0700 Subject: rcu: Use timer as backstop for NOCB deferred wakeups The handling of RCU's no-CBs CPUs has a maintenance headache, namely that if call_rcu() is invoked with interrupts disabled, the rcuo kthread wakeup must be defered to a point where we can be sure that scheduler locks are not held. Of course, there are a lot of code paths leading from an interrupts-disabled invocation of call_rcu(), and missing any one of these can result in excessive callback-invocation latency, and potentially even system hangs. This commit therefore uses a timer to guarantee that the wakeup will eventually occur. If one of the deferred-wakeup points kicks in, then the timer is simply cancelled. This commit also fixes up an incomplete removal of commits that were intended to plug remaining exit paths, which should have the added benefit of reducing the overhead of RCU's context-switch hooks. In addition, it simplifies leader-to-follower callback-list handoff by introducing locking. The call_rcu()-to-leader handoff continues to use atomic operations in order to maintain good real-time latency for common-case use of call_rcu(). Signed-off-by: Paul E. McKenney [ paulmck: Dan Carpenter fix for mod_timer() usage bug found by smatch. ] --- kernel/rcu/tree.h | 2 + kernel/rcu/tree_plugin.h | 179 +++++++++++++++++++++++++++++------------------ 2 files changed, 111 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9af0f31d6847..fe83f684ddcd 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -268,7 +268,9 @@ struct rcu_data { struct rcu_head **nocb_follower_tail; struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; + raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + struct timer_list nocb_timer; /* Enforce finite deferral. */ /* The following fields are used by the leader, hence own cacheline. */ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 908b309d60d7..bb9e6e43130f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1788,22 +1788,61 @@ bool rcu_is_nocb_cpu(int cpu) } /* - * Kick the leader kthread for this NOCB group. + * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock + * and this function releases it. */ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) +static void __wake_nocb_leader(struct rcu_data *rdp, bool force, + unsigned long flags) + __releases(rdp->nocb_lock) { struct rcu_data *rdp_leader = rdp->nocb_leader; - if (!READ_ONCE(rdp_leader->nocb_kthread)) + lockdep_assert_held(&rdp->nocb_lock); + if (!READ_ONCE(rdp_leader->nocb_kthread)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; - if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { + } + if (rdp_leader->nocb_leader_sleep || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + del_timer(&rdp->nocb_timer); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); + } else { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } } +/* + * Kick the leader kthread for this NOCB group, but caller has not + * acquired locks. + */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + __wake_nocb_leader(rdp, force, flags); +} + +/* + * Arrange to wake the leader kthread for this NOCB group at some + * future time when it is safe to do so. + */ +static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, + const char *reason) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) + mod_timer(&rdp->nocb_timer, jiffies + 1); + WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +} + /* * Does the specified CPU need an RCU callback for the specified flavor * of rcu_barrier()? @@ -1891,11 +1930,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeEmptyIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { @@ -1905,11 +1941,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeOvfIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { @@ -2031,6 +2064,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) static void nocb_leader_wait(struct rcu_data *my_rdp) { bool firsttime = true; + unsigned long flags; bool gotcbs; struct rcu_data *rdp; struct rcu_head **tail; @@ -2042,7 +2076,11 @@ wait_again: trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); - /* Memory barrier handled by smp_mb() calls below and repoll. */ + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); + my_rdp->nocb_leader_sleep = true; + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); @@ -2054,7 +2092,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; - smp_mb(); /* wakeup before ->nocb_head reads. */ + smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) @@ -2066,56 +2104,41 @@ wait_again: gotcbs = true; } - /* - * If there were no callbacks, sleep a bit, rescan after a - * memory barrier, and go retry. - */ + /* No callbacks? Sleep a bit if polling, and go retry. */ if (unlikely(!gotcbs)) { - if (!rcu_nocb_poll) + WARN_ON(signal_pending(current)); + if (rcu_nocb_poll) { + schedule_timeout_interruptible(1); + } else { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "WokeEmpty"); - WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); - - /* Rescan in case we were a victim of memory ordering. */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) - if (READ_ONCE(rdp->nocb_head)) { - /* Found CB, so short-circuit next wait. */ - my_rdp->nocb_leader_sleep = false; - break; - } + } goto wait_again; } /* Wait for one grace period. */ rcu_nocb_wait_gp(my_rdp); - /* - * We left ->nocb_leader_sleep unset to reduce cache thrashing. - * We set it now, but recheck for new callbacks while - * traversing our follower list. - */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ - /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - if (READ_ONCE(rdp->nocb_head)) + if (!rcu_nocb_poll && + READ_ONCE(rdp->nocb_head) && + READ_ONCE(my_rdp->nocb_leader_sleep)) { + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); + } if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ /* Append callbacks to follower's "done" list. */ - tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = rdp->nocb_gp_tail; *tail = rdp->nocb_gp_head; - smp_mb__after_atomic(); /* Store *tail before wakeup. */ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { - /* - * List was empty, wake up the follower. - * Memory barriers supplied by atomic_long_add(). - */ + /* List was empty, so wake up the follower. */ swake_up(&rdp->nocb_wq); } } @@ -2131,28 +2154,16 @@ wait_again: */ static void nocb_follower_wait(struct rcu_data *rdp) { - bool firsttime = true; - for (;;) { - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "FollowerSleep"); - swait_event_interruptible(rdp->nocb_wq, - READ_ONCE(rdp->nocb_follower_head)); - } else if (firsttime) { - /* Don't drown trace log with "Poll"! */ - firsttime = false; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); - } + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); + swait_event_interruptible(rdp->nocb_wq, + READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { /* ^^^ Ensure CB invocation follows _head test. */ return; } - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "WokeEmpty"); WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeEmpty"); } } @@ -2165,6 +2176,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) static int rcu_nocb_kthread(void *arg) { int c, cl; + unsigned long flags; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2179,11 +2191,14 @@ static int rcu_nocb_kthread(void *arg) nocb_follower_wait(rdp); /* Pull the ready-to-invoke callbacks onto local list. */ - list = READ_ONCE(rdp->nocb_follower_head); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + list = rdp->nocb_follower_head; + rdp->nocb_follower_head = NULL; + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); - WRITE_ONCE(rdp->nocb_follower_head, NULL); - tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, @@ -2226,18 +2241,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread(). */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { + unsigned long flags; int ndw; - if (!rcu_nocb_need_deferred_wakeup(rdp)) + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_nocb_need_deferred_wakeup(rdp)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; + } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); + __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } +/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ +static void do_nocb_deferred_wakeup_timer(unsigned long x) +{ + do_nocb_deferred_wakeup_common((struct rcu_data *)x); +} + +/* + * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. + * This means we do an inexact common-case check. Note that if + * we miss, ->nocb_timer will eventually clean things up. + */ +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + if (rcu_nocb_need_deferred_wakeup(rdp)) + do_nocb_deferred_wakeup_common(rdp); +} + void __init rcu_init_nohz(void) { int cpu; @@ -2287,6 +2323,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) rdp->nocb_tail = &rdp->nocb_head; init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_lock_init(&rdp->nocb_lock); + setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, + (unsigned long)rdp); } /* -- cgit v1.2.3 From f274f1e72d7171c80c8c790040e47a23a74796b6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 30 Jun 2017 13:13:59 -0700 Subject: task_work: Replace spin_unlock_wait() with lock/unlock pair There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore replaces the spin_unlock_wait() call in task_work_run() with a spin_lock_irq() and a spin_unlock_irq() aruond the cmpxchg() dequeue loop. This should be safe from a performance perspective because ->pi_lock is local to the task and because calls to the other side of the race, task_work_cancel(), should be rare. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- kernel/task_work.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index d513051fcca2..836a72a66fba 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -96,20 +96,16 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ + raw_spin_lock_irq(&task->pi_lock); do { work = READ_ONCE(task->task_works); head = !work && (task->flags & PF_EXITING) ? &work_exited : NULL; } while (cmpxchg(&task->task_works, work, head) != work); + raw_spin_unlock_irq(&task->pi_lock); if (!work) break; - /* - * Synchronize with task_work_cancel(). It can't remove - * the first entry == work, cmpxchg(task_works) should - * fail, but it can play with *work and other entries. - */ - raw_spin_unlock_wait(&task->pi_lock); do { next = work->next; -- cgit v1.2.3 From 4a6fc6107e904d4218cf1552bce3f1a59d724ba9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jun 2017 12:08:26 -0700 Subject: sched: Replace spin_unlock_wait() with lock/unlock pair There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore replaces the spin_unlock_wait() call in do_task_dead() with spin_lock() followed immediately by spin_unlock(). This should be safe from a performance perspective because the lock is this tasks ->pi_lock, and this is called only after the task exits. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Alan Stern Cc: Andrea Parri Cc: Linus Torvalds [ paulmck: Replace leading smp_mb() with smp_mb__before_spinlock(), courtesy of Arnd Bergmann's noting its odd location. ] --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c667b427b4..1179111d82a1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3352,8 +3352,9 @@ void __noreturn do_task_dead(void) * To avoid it, we have to wait for releasing tsk->pi_lock which * is held by try_to_wake_up() */ - smp_mb(); - raw_spin_unlock_wait(¤t->pi_lock); + smp_mb__before_spinlock(); + raw_spin_lock_irq(¤t->pi_lock); + raw_spin_unlock_irq(¤t->pi_lock); /* Causes final put_task_struct in finish_task_switch(): */ __set_current_state(TASK_DEAD); -- cgit v1.2.3 From a58163d8ca2c8d288ee9f95989712f98473a5ac2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Jun 2017 12:11:34 -0700 Subject: rcu: Migrate callbacks earlier in the CPU-offline timeline RCU callbacks must be migrated away from an outgoing CPU, and this is done near the end of the CPU-hotplug operation, after the outgoing CPU is long gone. Unfortunately, this means that other CPU-hotplug callbacks can execute while the outgoing CPU's callbacks are still immobilized on the long-gone CPU's callback lists. If any of these CPU-hotplug callbacks must wait, either directly or indirectly, for the invocation of any of the immobilized RCU callbacks, the system will hang. This commit avoids such hangs by migrating the callbacks away from the outgoing CPU immediately upon its departure, shortly after the return from __cpu_die() in takedown_cpu(). Thus, RCU is able to advance these callbacks and invoke them, which allows all the after-the-fact CPU-hotplug callbacks to wait on these RCU callbacks without risk of a hang. While in the neighborhood, this commit also moves rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() under a pre-existing #ifdef to avoid including dead code on the one hand and to avoid define-without-use warnings on the other hand. Reported-by: Jeffrey Hugo Link: http://lkml.kernel.org/r/db9c91f6-1b17-6136-84f0-03c3c2581ab4@codeaurora.org Signed-off-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Ingo Molnar Cc: Anna-Maria Gleixner Cc: Boris Ostrovsky Cc: Richard Weinberger --- include/linux/rcupdate.h | 1 + kernel/cpu.c | 1 + kernel/rcu/tree.c | 209 +++++++++++++++++++++++++---------------------- 3 files changed, 115 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f816fc72b51e..cf307ebf345d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -110,6 +110,7 @@ void rcu_bh_qs(void); void rcu_check_callbacks(int user); void rcu_report_dead(unsigned int cpu); void rcu_cpu_starting(unsigned int cpu); +void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); diff --git a/kernel/cpu.c b/kernel/cpu.c index eee033134262..bfbd649ccdc8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -650,6 +650,7 @@ static int takedown_cpu(unsigned int cpu) __cpu_die(cpu); tick_cleanup_dead_cpu(cpu); + rcutree_migrate_callbacks(cpu); return 0; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..9bb5dff50815 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2562,85 +2562,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) rcu_report_qs_rdp(rdp->cpu, rsp, rdp); } -/* - * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline, and the caller must hold the - * ->orphan_lock. - */ -static void -rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, - struct rcu_node *rnp, struct rcu_data *rdp) -{ - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs do not have orphanable callbacks. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) - return; - - /* - * Orphan the callbacks. First adjust the counts. This is safe - * because _rcu_barrier() excludes CPU-hotplug operations, so it - * cannot be running now. Thus no memory barrier is required. - */ - rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); - rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); - - /* - * Next, move those callbacks still needing a grace period to - * the orphanage, where some other CPU will pick them up. - * Some of the callbacks might have gone partway through a grace - * period, but that is too bad. They get to start over because we - * cannot assume that grace periods are synchronized across CPUs. - */ - rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - - /* - * Then move the ready-to-invoke callbacks to the orphanage, - * where some other CPU will pick them up. These will not be - * required to pass though another grace period: They are done. - */ - rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - - /* Finally, disallow further callbacks on this CPU. */ - rcu_segcblist_disable(&rdp->cblist); -} - -/* - * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->orphan_lock. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) -{ - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs are handled specially. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) - return; - - /* Do the accounting first. */ - rdp->n_cbs_adopted += rsp->orphan_done.len; - if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) - rcu_idle_count_callbacks_posted(); - rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); - - /* - * We do not need a memory barrier here because the only way we - * can get here if there is an rcu_barrier() in flight is if - * we are the task doing the rcu_barrier(). - */ - - /* First adopt the ready-to-invoke callbacks, then the done ones. */ - rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); - WARN_ON_ONCE(rsp->orphan_done.head); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - WARN_ON_ONCE(rsp->orphan_pend.head); - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != - !rcu_segcblist_n_cbs(&rdp->cblist)); -} - /* * Trace the fact that this CPU is going offline. */ @@ -2704,14 +2625,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) /* * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup, - * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them. There can only be one CPU hotplug operation at a time, - * so no other CPU can be attempting to update rcu_cpu_kthread_task. + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2720,18 +2639,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); - rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp, flags); - raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - - WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || - !rcu_segcblist_empty(&rdp->cblist), - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", - cpu, rcu_segcblist_n_cbs(&rdp->cblist), - rcu_segcblist_first_cb(&rdp->cblist)); } /* @@ -3937,6 +3844,116 @@ void rcu_report_dead(unsigned int cpu) for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); } + +/* + * Send the specified CPU's RCU callbacks to the orphanage. The + * specified CPU must be offline, and the caller must hold the + * ->orphan_lock. + */ +static void +rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, + struct rcu_node *rnp, struct rcu_data *rdp) +{ + lockdep_assert_held(&rsp->orphan_lock); + + /* No-CBs CPUs do not have orphanable callbacks. */ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) + return; + + /* + * Orphan the callbacks. First adjust the counts. This is safe + * because _rcu_barrier() excludes CPU-hotplug operations, so it + * cannot be running now. Thus no memory barrier is required. + */ + rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); + rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); + + /* + * Next, move those callbacks still needing a grace period to + * the orphanage, where some other CPU will pick them up. + * Some of the callbacks might have gone partway through a grace + * period, but that is too bad. They get to start over because we + * cannot assume that grace periods are synchronized across CPUs. + */ + rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); + + /* + * Then move the ready-to-invoke callbacks to the orphanage, + * where some other CPU will pick them up. These will not be + * required to pass though another grace period: They are done. + */ + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); + + /* Finally, disallow further callbacks on this CPU. */ + rcu_segcblist_disable(&rdp->cblist); +} + +/* + * Adopt the RCU callbacks from the specified rcu_state structure's + * orphanage. The caller must hold the ->orphan_lock. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) +{ + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + + lockdep_assert_held(&rsp->orphan_lock); + + /* No-CBs CPUs are handled specially. */ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || + rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) + return; + + /* Do the accounting first. */ + rdp->n_cbs_adopted += rsp->orphan_done.len; + if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) + rcu_idle_count_callbacks_posted(); + rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); + + /* + * We do not need a memory barrier here because the only way we + * can get here if there is an rcu_barrier() in flight is if + * we are the task doing the rcu_barrier(). + */ + + /* First adopt the ready-to-invoke callbacks, then the done ones. */ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); + WARN_ON_ONCE(rsp->orphan_done.head); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); + WARN_ON_ONCE(rsp->orphan_pend.head); + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != + !rcu_segcblist_n_cbs(&rdp->cblist)); +} + +/* Orphan the dead CPU's callbacks, and then adopt them. */ +static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); + rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); + rcu_adopt_orphan_cbs(rsp, flags); + raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); + WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || + !rcu_segcblist_empty(&rdp->cblist), + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", + cpu, rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_first_cb(&rdp->cblist)); +} + +/* + * The outgoing CPU has just passed through the dying-idle state, + * and we are being invoked from the CPU that was IPIed to continue the + * offline operation. We need to migrate the outgoing CPU's callbacks. + */ +void rcutree_migrate_callbacks(int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_migrate_callbacks(cpu, rsp); +} #endif /* -- cgit v1.2.3 From 313517fc44fb2d8403654b2d3e511da7d1c78cd6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Jun 2017 16:55:40 -0700 Subject: rcu: Make expedited GPs correctly handle hardware CPU insertion The update of the ->expmaskinitnext and of ->ncpus are unsynchronized, with the value of ->ncpus being incremented long before the corresponding ->expmaskinitnext mask is updated. If an RCU expedited grace period sees ->ncpus change, it will update the ->expmaskinit masks from the new ->expmaskinitnext masks. But it is possible that ->ncpus has already been updated, but the ->expmaskinitnext masks still have their old values. For the current expedited grace period, no harm done. The CPU could not have been online before the grace period started, so there is no need to wait for its non-existent pre-existing readers. But the next RCU expedited grace period is in a world of hurt. The value of ->ncpus has already been updated, so this grace period will assume that the ->expmaskinitnext masks have not changed. But they have, and they won't be taken into account until the next never-been-online CPU comes online. This means that RCU will be ignoring some CPUs that it should be paying attention to. The solution is to update ->ncpus and ->expmaskinitnext while holding the ->lock for the rcu_node structure containing the ->expmaskinitnext mask. Because smp_store_release() is now used to update ->ncpus and smp_load_acquire() is now used to locklessly read it, if the expedited grace period sees ->ncpus change, then the updating CPU has to already be holding the corresponding ->lock. Therefore, when the expedited grace period later acquires that ->lock, it is guaranteed to see the new value of ->expmaskinitnext. On the other hand, if the expedited grace period loads ->ncpus just before an update, earlier full memory barriers guarantee that the incoming CPU isn't far enough along to be running any RCU readers. This commit therefore makes the required change. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++-- kernel/rcu/tree_exp.h | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9bb5dff50815..f431114bc06a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3684,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - if (!rdp->beenonline) - WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; @@ -3789,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; + int nbits; + unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_state *rsp; @@ -3799,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->qsmaskinitnext |= mask; + oldmask = rnp->expmaskinitnext; rnp->expmaskinitnext |= mask; + oldmask ^= rnp->expmaskinitnext; + nbits = bitmap_weight(&oldmask, BITS_PER_LONG); + /* Allow lockless access for expedited grace periods. */ + smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dd21ca47e4b4..46d61b597731 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) unsigned long flags; unsigned long mask; unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); + int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */ struct rcu_node *rnp; struct rcu_node *rnp_up; -- cgit v1.2.3 From a2b2df207acff1e3f965ff2c7c38255b06d583cb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Jun 2017 15:38:26 -0700 Subject: torture: Fix typo suppressing CPU-hotplug statistics The torture status line contains a series of values preceded by "onoff:". The last value in that line, the one preceding the "HZ=" string, is always zero. The reason that it is always zero is that torture_offline() was incrementing the sum_offl pointer instead of the value that this pointer referenced. This commit therefore makes this increment operate on the statistic rather than the pointer to the statistic. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 55de96529287..637e172835d8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, torture_type, cpu); (*n_offl_successes)++; delta = jiffies - starttime; - sum_offl += delta; + *sum_offl += delta; if (*min_offl < 0) { *min_offl = delta; *max_offl = delta; -- cgit v1.2.3 From c47e067a3c57835fe5ce24d50482f5c325a64efd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jun 2017 20:30:08 -0700 Subject: rcu: Remove orphan/adopt event-tracing fields The rcu_node structure's ->n_cbs_orphaned and ->n_cbs_adopted fields are updated, but never read. This commit therefore removes them. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 -- kernel/rcu/tree.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f431114bc06a..f5acf34247fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3871,7 +3871,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * because _rcu_barrier() excludes CPU-hotplug operations, so it * cannot be running now. Thus no memory barrier is required. */ - rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); /* @@ -3910,7 +3909,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) return; /* Do the accounting first. */ - rdp->n_cbs_adopted += rsp->orphan_done.len; if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) rcu_idle_count_callbacks_posted(); rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9af0f31d6847..aec53c4d4aec 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -219,8 +219,6 @@ struct rcu_data { /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ - unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ - unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ -- cgit v1.2.3 From 95335c0355834c16cc11f041a981ee6782dba2e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 10:49:50 -0700 Subject: rcu: Check for NOCB CPUs and empty lists earlier in CB migration The current CPU-hotplug RCU-callback-migration code checks for the source (newly offlined) CPU being a NOCBs CPU down in rcu_send_cbs_to_orphanage(). This commit simplifies callback migration a bit by moving this check up to rcu_migrate_callbacks(). This commit also adds a check for the source CPU having no callbacks, which eases analysis of the rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() functions. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f5acf34247fb..aeea697d6f9f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3862,10 +3862,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, { lockdep_assert_held(&rsp->orphan_lock); - /* No-CBs CPUs do not have orphanable callbacks. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) - return; - /* * Orphan the callbacks. First adjust the counts. This is safe * because _rcu_barrier() excludes CPU-hotplug operations, so it @@ -3935,6 +3931,9 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) + return; /* No callbacks to migrate. */ + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); -- cgit v1.2.3 From b1a2d79fe7d210c114003362d93d529912d244df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 12:23:46 -0700 Subject: rcu: Make NOCB CPUs migrate CBs directly from outgoing CPU RCU's CPU-hotplug callback-migration code first moves the outgoing CPU's callbacks to ->orphan_done and ->orphan_pend, and only then moves them to the NOCB callback list. This commit avoids the extra step (and simplifies the code) by moving the callbacks directly from the outgoing CPU's callback list to the NOCB callback list. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 ++++++++------ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 31 ++++++++++--------------------- 3 files changed, 19 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aeea697d6f9f..4ea28e820f4a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3899,11 +3899,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) lockdep_assert_held(&rsp->orphan_lock); - /* No-CBs CPUs are handled specially. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) - return; - /* Do the accounting first. */ if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) rcu_idle_count_callbacks_posted(); @@ -3928,13 +3923,20 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) { unsigned long flags; + struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); + local_irq_save(flags); + my_rdp = this_cpu_ptr(rsp->rda); + if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { + local_irq_restore(flags); + return; + } + raw_spin_lock(&rsp->orphan_lock); /* irqs already disabled. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index aec53c4d4aec..574513cf49b4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -493,7 +493,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 908b309d60d7..ff7d5ee49816 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1961,30 +1961,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is * not a no-CBs CPU. */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { - long ql = rsp->orphan_done.len; - long qll = rsp->orphan_done.len_lazy; - - /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!"); if (!rcu_is_nocb_cpu(smp_processor_id())) - return false; - - /* First, enqueue the donelist, if any. This preserves CB ordering. */ - if (rsp->orphan_done.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), - rcu_cblist_tail(&rsp->orphan_done), - ql, qll, flags); - } - if (rsp->orphan_pend.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), - rcu_cblist_tail(&rsp->orphan_pend), - ql, qll, flags); - } - rcu_cblist_init(&rsp->orphan_done); - rcu_cblist_init(&rsp->orphan_pend); + return false; /* Not NOCBs CPU, caller must migrate CBs. */ + __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), + rcu_segcblist_tail(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_disable(&rdp->cblist); return true; } @@ -2459,7 +2448,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, return false; } -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { -- cgit v1.2.3 From 9fa46fb8c9c6dfad30487fb3d905c2ff04b379b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 15:43:27 -0700 Subject: rcu: Advance outgoing CPU's callbacks before migrating them It is possible that the outgoing CPU is unaware of recent grace periods, and so it is also possible that some of its pending callbacks are actually ready to be invoked. The current callback-migration code would needlessly force these callbacks to pass through another grace period. This commit therefore invokes rcu_advance_cbs() on the outgoing CPU's callbacks in order to give them full credit for having passed through any recent grace periods. This also fixes an odd theoretical bug where there are no callbacks in the system except for those on the outgoing CPU, none of those callbacks have yet been associated with a grace-period number, there is never again another callback registered, and the surviving CPU never again takes a scheduling-clock interrupt, never goes idle, and never enters nohz_full userspace execution. Yes, this is (just barely) possible. It requires that the surviving CPU be a nohz_full CPU, that its scheduler-clock interrupt be shut off, and that it loop forever in the kernel. You get bonus points if you can make this one happen! ;-) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4ea28e820f4a..c080c6ed66af 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3926,6 +3926,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -3936,7 +3937,11 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) local_irq_restore(flags); return; } - raw_spin_lock(&rsp->orphan_lock); /* irqs already disabled. */ + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ + raw_spin_unlock_rcu_node(rnp_root); + + raw_spin_lock(&rsp->orphan_lock); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); -- cgit v1.2.3 From 537b85c870babacc1cf13235e92bee9de86210e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 17:59:02 -0700 Subject: rcu: Eliminate rcu_state ->orphan_lock The ->orphan_lock is acquired and released only within the rcu_migrate_callbacks() function, which now acquires the root rcu_node structure's ->lock. This commit therefore eliminates the ->orphan_lock in favor of the root rcu_node structure's ->lock. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 +++------------ kernel/rcu/tree.h | 3 --- 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c080c6ed66af..58ab489eca66 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,7 +97,6 @@ struct rcu_state sname##_state = { \ .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ - .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ @@ -3853,15 +3852,12 @@ void rcu_report_dead(unsigned int cpu) /* * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline, and the caller must hold the - * ->orphan_lock. + * specified CPU must be offline. */ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rsp->orphan_lock); - /* * Orphan the callbacks. First adjust the counts. This is safe * because _rcu_barrier() excludes CPU-hotplug operations, so it @@ -3891,14 +3887,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, /* * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->orphan_lock. + * orphanage. */ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - lockdep_assert_held(&rsp->orphan_lock); - /* Do the accounting first. */ if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) rcu_idle_count_callbacks_posted(); @@ -3939,12 +3933,9 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ - raw_spin_unlock_rcu_node(rnp_root); - - raw_spin_lock(&rsp->orphan_lock); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); - raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 574513cf49b4..62b1d0b0d47c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -348,14 +348,11 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; - /* Protect following fields. */ struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ /* need a grace period. */ struct rcu_cblist orphan_done; /* Orphaned callbacks that */ /* are ready to invoke. */ /* (Contains counts.) */ - /* End of fields guarded by orphan_lock. */ struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ -- cgit v1.2.3 From 21cc248384aeb0375b3cac164c276c78c503291a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 20:37:51 -0700 Subject: rcu: Advance callbacks after migration When migrating callbacks from a newly offlined CPU, we are already holding the root rcu_node structure's lock, so it costs almost nothing to advance and accelerate the newly migrated callbacks. This patch therefore makes this advancing and acceleration happen. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 58ab489eca66..f9f01aeb5add 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3935,6 +3935,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); + rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), -- cgit v1.2.3 From f2dbe4a562d4f17cc1bad3e36a9d1ccb19c86604 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Jun 2017 07:44:06 -0700 Subject: rcu: Localize rcu_state ->orphan_pend and ->orphan_done Given that the rcu_state structure's >orphan_pend and ->orphan_done fields are used only during migration of callbacks from the recently offlined CPU to a surviving CPU, if rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() are combined, these fields can become local variables in the combined function. This commit therefore combines rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() into a new rcu_segcblist_merge() function and removes the ->orphan_pend and ->orphan_done fields. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 24 +++++++++++++++ kernel/rcu/rcu_segcblist.h | 2 ++ kernel/rcu/tree.c | 73 +++------------------------------------------- kernel/rcu/tree.h | 6 ---- 4 files changed, 30 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 2b62a38b080f..7091d824b893 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -503,3 +503,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, return true; return false; } + +/* + * Merge the source rcu_segcblist structure into the destination + * rcu_segcblist structure, then initialize the source. Any pending + * callbacks from the source get to start over. It is best to + * advance and accelerate both the destination and the source + * before merging. + */ +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, + struct rcu_segcblist *src_rsclp) +{ + struct rcu_cblist donecbs; + struct rcu_cblist pendcbs; + + rcu_cblist_init(&donecbs); + rcu_cblist_init(&pendcbs); + rcu_segcblist_extract_count(src_rsclp, &donecbs); + rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); + rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); + rcu_segcblist_insert_count(dst_rsclp, &donecbs); + rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); + rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); + rcu_segcblist_init(src_rsclp); +} diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 6e36e36478cd..c2f319f3f06a 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -162,3 +162,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, unsigned long seq); +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, + struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9f01aeb5add..d330c17c8df4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,8 +97,6 @@ struct rcu_state sname##_state = { \ .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ - .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ - .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -3850,76 +3848,12 @@ void rcu_report_dead(unsigned int cpu) rcu_cleanup_dying_idle_cpu(cpu, rsp); } -/* - * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline. - */ -static void -rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, - struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* - * Orphan the callbacks. First adjust the counts. This is safe - * because _rcu_barrier() excludes CPU-hotplug operations, so it - * cannot be running now. Thus no memory barrier is required. - */ - rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); - - /* - * Next, move those callbacks still needing a grace period to - * the orphanage, where some other CPU will pick them up. - * Some of the callbacks might have gone partway through a grace - * period, but that is too bad. They get to start over because we - * cannot assume that grace periods are synchronized across CPUs. - */ - rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - - /* - * Then move the ready-to-invoke callbacks to the orphanage, - * where some other CPU will pick them up. These will not be - * required to pass though another grace period: They are done. - */ - rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - - /* Finally, disallow further callbacks on this CPU. */ - rcu_segcblist_disable(&rdp->cblist); -} - -/* - * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) -{ - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - - /* Do the accounting first. */ - if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) - rcu_idle_count_callbacks_posted(); - rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); - - /* - * We do not need a memory barrier here because the only way we - * can get here if there is an rcu_barrier() in flight is if - * we are the task doing the rcu_barrier(). - */ - - /* First adopt the ready-to-invoke callbacks, then the done ones. */ - rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); - WARN_ON_ONCE(rsp->orphan_done.head); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - WARN_ON_ONCE(rsp->orphan_pend.head); - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != - !rcu_segcblist_n_cbs(&rdp->cblist)); -} - -/* Orphan the dead CPU's callbacks, and then adopt them. */ +/* Migrate the dead CPU's callbacks to the current CPU. */ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) { unsigned long flags; struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) @@ -3933,15 +3867,16 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ - rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp, flags); rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != + !rcu_segcblist_n_cbs(&my_rdp->cblist)); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 62b1d0b0d47c..b99f31c2b0c3 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -348,12 +348,6 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ - struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ - /* need a grace period. */ - struct rcu_cblist orphan_done; /* Orphaned callbacks that */ - /* are ready to invoke. */ - /* (Contains counts.) */ - struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ -- cgit v1.2.3 From aed4e046863820e6d06ebf7c079e9ad924608edf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Jun 2017 08:38:45 -0700 Subject: rcu: Remove unused RCU list functions Given changes to callback migration, rcu_cblist_head(), rcu_cblist_tail(), rcu_cblist_count_cbs(), rcu_segcblist_segempty(), rcu_segcblist_dequeued_lazy(), and rcu_segcblist_new_cbs() are no longer used. This commit therefore removes them. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 84 ---------------------------------------------- kernel/rcu/rcu_segcblist.h | 26 -------------- 2 files changed, 110 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7091d824b893..7649fcd2c4c7 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -35,24 +35,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp) rclp->len_lazy = 0; } -/* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. - */ -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ - int cnt = 0; - struct rcu_head **rhpp = &rclp->head; - - for (;;) { - if (!*rhpp) - return cnt; - if (++cnt > lim) - return -1; - rhpp = &(*rhpp)->next; - } -} - /* * Dequeue the oldest rcu_head structure from the specified callback * list. This function assumes that the callback is non-lazy, but @@ -102,17 +84,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) rsclp->tails[RCU_NEXT_TAIL] = NULL; } -/* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? - */ -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ - if (seg == RCU_DONE_TAIL) - return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; - return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - /* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? @@ -133,50 +104,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); } -/* - * Dequeue and return the first ready-to-invoke callback. If there - * are no ready-to-invoke callbacks, return NULL. Disables interrupts - * to avoid interference. Does not protect from interference from other - * CPUs or tasks. - */ -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - int i; - struct rcu_head *rhp; - - local_irq_save(flags); - if (!rcu_segcblist_ready_cbs(rsclp)) { - local_irq_restore(flags); - return NULL; - } - rhp = rsclp->head; - BUG_ON(!rhp); - rsclp->head = rhp->next; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { - if (rsclp->tails[i] != &rhp->next) - break; - rsclp->tails[i] = &rsclp->head; - } - smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ - WRITE_ONCE(rsclp->len, rsclp->len - 1); - local_irq_restore(flags); - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - - local_irq_save(flags); - rsclp->len_lazy--; - local_irq_restore(flags); -} - /* * Return a pointer to the first callback in the specified rcu_segcblist * structure. This is useful for diagnostics. @@ -202,17 +129,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) return NULL; } -/* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? - */ -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - /* * Enqueue the specified callback onto the specified rcu_segcblist * structure, updating accounting as needed. Note that the ->len diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index c2f319f3f06a..581c12b63544 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) rclp->len_lazy--; } -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ - return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ - WARN_ON_ONCE(!rclp->head); - return rclp->tail; -} - void rcu_cblist_init(struct rcu_cblist *rclp); -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim); struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); /* @@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp, bool lazy); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, -- cgit v1.2.3 From 09efeeee173e9f541b15157d30658cd8b23ec4f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 Jul 2017 10:56:46 -0700 Subject: rcu: Move callback-list warning to irq-disable region After adopting callbacks from a newly offlined CPU, the adopting CPU checks to make sure that its callback list's count is zero only if the list has no callbacks and vice versa. Unfortunately, it does so after enabling interrupts, which means that false positives are possible due to interrupt handlers invoking call_rcu(). Although these false positives are improbable, rcutorture did make it happen once. This commit therefore moves this check to an irq-disabled region of code, thus suppressing the false positive. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d330c17c8df4..4b03bddbca3c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3869,14 +3869,14 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != + !rcu_segcblist_n_cbs(&my_rdp->cblist)); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); - WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != - !rcu_segcblist_n_cbs(&my_rdp->cblist)); } /* -- cgit v1.2.3 From 35732cf9dd38b1efb0f2f22c91c61b51337d1ac3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Jul 2017 13:30:21 -0700 Subject: srcu: Provide ordering for CPU not involved in grace period Tree RCU guarantees that every online CPU has a memory barrier between any given grace period and any of that CPU's RCU read-side sections that must be ordered against that grace period. Since RCU doesn't always know where read-side critical sections are, the actual implementation guarantees order against prior and subsequent non-idle non-offline code, whether in an RCU read-side critical section or not. As a result, there does not need to be a memory barrier at the end of synchronize_rcu() and friends because the ordering internal to the grace period has ordered every CPU's post-grace-period execution against each CPU's pre-grace-period execution, again for all non-idle online CPUs. In contrast, SRCU can have non-idle online CPUs that are completely uninvolved in a given SRCU grace period, for example, a CPU that never runs any SRCU read-side critical sections and took no part in the grace-period processing. It is in theory possible for a given synchronize_srcu()'s wakeup to be delivered to a CPU that was completely uninvolved in the prior SRCU grace period, which could mean that the code following that synchronize_srcu() would end up being unordered with respect to both the grace period and any pre-existing SRCU read-side critical sections. This commit therefore adds an smp_mb() to the end of __synchronize_srcu(), which prevents this scenario from occurring. Reported-by: Lance Roy Signed-off-by: Paul E. McKenney Acked-by: Lance Roy Cc: # 4.12.x --- kernel/rcu/srcutree.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 94bd6ed43ea3..c1c0ee3cce3b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -897,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); wait_for_completion(&rcu.completion); destroy_rcu_head_on_stack(&rcu.head); + + /* + * Make sure that later code is ordered after the SRCU grace + * period. This pairs with the raw_spin_lock_irq_rcu_node() + * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed + * because the current CPU might have been totally uninvolved with + * (and thus unordered against) that grace period. + */ + smp_mb(); } /** -- cgit v1.2.3 From 955dbdf4ce87fd9be4bc8378e26b8c2eb8b3d184 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 17 Jun 2017 08:10:08 -0400 Subject: sched: Allow migrating kthreads into online but inactive CPUs Per-cpu workqueues have been tripping CPU affinity sanity checks while a CPU is being offlined. A per-cpu kworker ends up running on a CPU which isn't its target CPU while the CPU is online but inactive. While the scheduler allows kthreads to wake up on an online but inactive CPU, it doesn't allow a running kthread to be migrated to such a CPU, which leads to an odd situation where setting affinity on a sleeping and running kthread leads to different results. Each mem-reclaim workqueue has one rescuer which guarantees forward progress and the rescuer needs to bind itself to the CPU which needs help in making forward progress; however, due to the above issue, while set_cpus_allowed_ptr() succeeds, the rescuer doesn't end up on the correct CPU if the CPU is in the process of going offline, tripping the sanity check and executing the work item on the wrong CPU. This patch updates __migrate_task() so that kthreads can be migrated into an inactive but online CPU. Signed-off-by: Tejun Heo Reported-by: "Paul E. McKenney" Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c667b427b4..bfee6ea7db49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -951,8 +951,13 @@ struct migration_arg { static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) { - if (unlikely(!cpu_active(dest_cpu))) - return rq; + if (p->flags & PF_KTHREAD) { + if (unlikely(!cpu_online(dest_cpu))) + return rq; + } else { + if (unlikely(!cpu_active(dest_cpu))) + return rq; + } /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) -- cgit v1.2.3 From 0a508183204a983a702c168f23e2121623eb51d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jun 2017 12:47:04 -0700 Subject: completion: Replace spin_unlock_wait() with lock/unlock pair There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore replaces the spin_unlock_wait() call in completion_done() with spin_lock() followed immediately by spin_unlock(). This should be safe from a performance perspective because the lock will be held only the wakeup happens really quickly. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Alan Stern Cc: Andrea Parri Cc: Linus Torvalds --- kernel/sched/completion.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae9bf2f..c9524d2d9316 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -300,6 +300,8 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { + unsigned long flags; + if (!READ_ONCE(x->done)) return false; @@ -307,14 +309,9 @@ bool completion_done(struct completion *x) * If ->done, we need to wait for complete() to release ->wait.lock * otherwise we can end up freeing the completion before complete() * is done referencing it. - * - * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders - * the loads of ->done and ->wait.lock such that we cannot observe - * the lock before complete() acquires it while observing the ->done - * after it's acquired the lock. */ - smp_rmb(); - spin_unlock_wait(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); + spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); -- cgit v1.2.3 From d859cd40f5e0f1a584e3f60a91cca64f69a82137 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jun 2017 12:55:21 -0700 Subject: exit: Replace spin_unlock_wait() with lock/unlock pair There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore replaces the spin_unlock_wait() call in do_exit() with spin_lock() followed immediately by spin_unlock(). This should be safe from a performance perspective because the lock is a per-task lock, and this is happening only at task-exit time. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Alan Stern Cc: Andrea Parri Cc: Linus Torvalds --- kernel/exit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..abfbcf66e5c0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -819,7 +819,8 @@ void __noreturn do_exit(long code) * Ensure that we must observe the pi_state in exit_mm() -> * mm_release() -> exit_pi_state_list(). */ - raw_spin_unlock_wait(&tsk->pi_lock); + raw_spin_lock_irq(&tsk->pi_lock); + raw_spin_unlock_irq(&tsk->pi_lock); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", -- cgit v1.2.3 From 2a874fd4e96a1a393a802ca295cf477b880ce226 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jun 2017 15:47:44 -0700 Subject: locking: Remove spin_unlock_wait() generic definitions There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore removes spin_unlock_wait() and related definitions from core code. Signed-off-by: Paul E. McKenney Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Will Deacon Cc: Peter Zijlstra Cc: Alan Stern Cc: Andrea Parri Cc: Linus Torvalds --- include/asm-generic/qspinlock.h | 14 ----- include/linux/spinlock.h | 11 ---- include/linux/spinlock_up.h | 6 --- kernel/locking/qspinlock.c | 117 ---------------------------------------- 4 files changed, 148 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 9f0681bf1e87..66260777d644 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -21,17 +21,6 @@ #include -/** - * queued_spin_unlock_wait - wait until the _current_ lock holder releases the lock - * @lock : Pointer to queued spinlock structure - * - * There is a very slight possibility of live-lock if the lockers keep coming - * and the waiter is just unfortunate enough to not see any unlock state. - */ -#ifndef queued_spin_unlock_wait -extern void queued_spin_unlock_wait(struct qspinlock *lock); -#endif - /** * queued_spin_is_locked - is the spinlock locked? * @lock: Pointer to queued spinlock structure @@ -41,8 +30,6 @@ extern void queued_spin_unlock_wait(struct qspinlock *lock); static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { /* - * See queued_spin_unlock_wait(). - * * Any !0 state indicates it is locked, even if _Q_LOCKED_VAL * isn't immediately observable. */ @@ -135,6 +122,5 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock) #define arch_spin_trylock(l) queued_spin_trylock(l) #define arch_spin_unlock(l) queued_spin_unlock(l) #define arch_spin_lock_flags(l, f) queued_spin_lock(l) -#define arch_spin_unlock_wait(l) queued_spin_unlock_wait(l) #endif /* __ASM_GENERIC_QSPINLOCK_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 59248dcc6ef3..ef018a6e4985 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -130,12 +130,6 @@ do { \ #define smp_mb__before_spinlock() smp_wmb() #endif -/** - * raw_spin_unlock_wait - wait until the spinlock gets unlocked - * @lock: the spinlock in question. - */ -#define raw_spin_unlock_wait(lock) arch_spin_unlock_wait(&(lock)->raw_lock) - #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); #define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock) @@ -369,11 +363,6 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock) raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) -static __always_inline void spin_unlock_wait(spinlock_t *lock) -{ - raw_spin_unlock_wait(&lock->rlock); -} - static __always_inline int spin_is_locked(spinlock_t *lock) { return raw_spin_is_locked(&lock->rlock); diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 0d9848de677d..612fb530af41 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -26,11 +26,6 @@ #ifdef CONFIG_DEBUG_SPINLOCK #define arch_spin_is_locked(x) ((x)->slock == 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, VAL); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { lock->slock = 0; @@ -73,7 +68,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #else /* DEBUG_SPINLOCK */ #define arch_spin_is_locked(lock) ((void)(lock), 0) -#define arch_spin_unlock_wait(lock) do { barrier(); (void)(lock); } while (0) /* for sched/core.c and kernel_lock.c: */ # define arch_spin_lock(lock) do { barrier(); (void)(lock); } while (0) # define arch_spin_lock_flags(lock, flags) do { barrier(); (void)(lock); } while (0) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index fd24153e8a48..294294c71ba4 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif -/* - * Various notes on spin_is_locked() and spin_unlock_wait(), which are - * 'interesting' functions: - * - * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE - * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, - * PPC). Also qspinlock has a similar issue per construction, the setting of - * the locked byte can be unordered acquiring the lock proper. - * - * This gets to be 'interesting' in the following cases, where the /should/s - * end up false because of this issue. - * - * - * CASE 1: - * - * So the spin_is_locked() correctness issue comes from something like: - * - * CPU0 CPU1 - * - * global_lock(); local_lock(i) - * spin_lock(&G) spin_lock(&L[i]) - * for (i) if (!spin_is_locked(&G)) { - * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); - * return; - * } - * // deal with fail - * - * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such - * that there is exclusion between the two critical sections. - * - * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from - * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) - * /should/ be constrained by the ACQUIRE from spin_lock(&G). - * - * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. - * - * - * CASE 2: - * - * For spin_unlock_wait() there is a second correctness issue, namely: - * - * CPU0 CPU1 - * - * flag = set; - * smp_mb(); spin_lock(&l) - * spin_unlock_wait(&l); if (!flag) - * // add to lockless list - * spin_unlock(&l); - * // iterate lockless list - * - * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 - * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE - * semantics etc..) - * - * Where flag /should/ be ordered against the locked store of l. - */ - -/* - * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before - * issuing an _unordered_ store to set _Q_LOCKED_VAL. - * - * This means that the store can be delayed, but no later than the - * store-release from the unlock. This means that simply observing - * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. - * - * There are two paths that can issue the unordered store: - * - * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 - * - * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 - * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 - * - * However, in both cases we have other !0 state we've set before to queue - * ourseves: - * - * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our - * load is constrained by that ACQUIRE to not pass before that, and thus must - * observe the store. - * - * For (2) we have a more intersting scenario. We enqueue ourselves using - * xchg_tail(), which ends up being a RELEASE. This in itself is not - * sufficient, however that is followed by an smp_cond_acquire() on the same - * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and - * guarantees we must observe that store. - * - * Therefore both cases have other !0 state that is observable before the - * unordered locked byte store comes through. This means we can use that to - * wait for the lock store, and then wait for an unlock. - */ -#ifndef queued_spin_unlock_wait -void queued_spin_unlock_wait(struct qspinlock *lock) -{ - u32 val; - - for (;;) { - val = atomic_read(&lock->val); - - if (!val) /* not locked, we're done */ - goto done; - - if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ - break; - - /* not locked, but pending, wait until we observe the lock */ - cpu_relax(); - } - - /* any unlock is good */ - while (atomic_read(&lock->val) & _Q_LOCKED_MASK) - cpu_relax(); - -done: - smp_acquire__after_ctrl_dep(); -} -EXPORT_SYMBOL(queued_spin_unlock_wait); -#endif - #endif /* _GEN_PV_LOCK_SLOWPATH */ /** -- cgit v1.2.3 From 7a60fabd2f3a07d4ca45f143bb29638e6fa1f8bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Nov 2016 06:24:30 -0800 Subject: sched,rcu: Make cond_resched() provide RCU quiescent state There is some confusion as to which of cond_resched() or cond_resched_rcu_qs() should be added to long in-kernel loops. This commit therefore eliminates the decision by adding RCU quiescent states to cond_resched(). This commit also simplifies the code that used to interact with cond_resched_rcu_qs(), and that now interacts with cond_resched(), to reduce its overhead. This reduction is necessary to allow the heavier-weight cond_resched_rcu_qs() mechanism to be invoked everywhere that cond_resched() is invoked. Part of that reduction in overhead converts the jiffies_till_sched_qs kernel parameter to read-only at runtime, thus eliminating the need for bounds checking. Reported-by: Michal Hocko Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra --- include/linux/sched.h | 3 ++- kernel/rcu/tree.c | 25 +++++-------------------- kernel/sched/core.c | 1 + 3 files changed, 8 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..d2f291a3a44a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1522,10 +1522,11 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) * cond_resched_lock() will drop the spinlock before scheduling, * cond_resched_softirq() will enable bhs before scheduling. */ +void rcu_all_qs(void); #ifndef CONFIG_PREEMPT extern int _cond_resched(void); #else -static inline int _cond_resched(void) { return 0; } +static inline int _cond_resched(void) { rcu_all_qs(); return 0; } #endif #define cond_resched() ({ \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..e40cb5190783 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -537,8 +537,8 @@ module_param(rcu_kick_kthreads, bool, 0644); * How long the grace period must be before we start recruiting * quiescent-state help from rcu_note_context_switch(). */ -static ulong jiffies_till_sched_qs = HZ / 20; -module_param(jiffies_till_sched_qs, ulong, 0644); +static ulong jiffies_till_sched_qs = HZ / 10; +module_param(jiffies_till_sched_qs, ulong, 0444); static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); @@ -1230,7 +1230,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) unsigned long jtsq; bool *rnhqp; bool *ruqp; - unsigned long rjtsc; struct rcu_node *rnp; /* @@ -1247,23 +1246,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* Compute and saturate jiffies_till_sched_qs. */ - jtsq = jiffies_till_sched_qs; - rjtsc = rcu_jiffies_till_stall_check(); - if (jtsq > rjtsc / 2) { - WRITE_ONCE(jiffies_till_sched_qs, rjtsc); - jtsq = rjtsc / 2; - } else if (jtsq < 1) { - WRITE_ONCE(jiffies_till_sched_qs, 1); - jtsq = 1; - } - /* * Has this CPU encountered a cond_resched_rcu_qs() since the * beginning of the grace period? For this to be the case, * the CPU has to have noticed the current grace period. This * might not be the case for nohz_full CPUs looping in the kernel. */ + jtsq = jiffies_till_sched_qs; rnp = rdp->mynode; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && @@ -1271,7 +1260,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); return 1; - } else { + } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ smp_store_release(ruqp, true); } @@ -1299,10 +1288,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * updates are only once every few jiffies, the probability of * lossage (and thus of slight grace-period extension) is * quite low. - * - * Note that if the jiffies_till_sched_qs boot/sysfs parameter - * is set too high, we override with half of the RCU CPU stall - * warning delay. */ rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && @@ -1311,7 +1296,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ + rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */ } /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c667b427b4..9433633012ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4808,6 +4808,7 @@ int __sched _cond_resched(void) preempt_schedule_common(); return 1; } + rcu_all_qs(); return 0; } EXPORT_SYMBOL(_cond_resched); -- cgit v1.2.3 From 60d76227c71b8ce9dcd6109e7ce41b1d20cb9fc4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 May 2017 08:05:00 -0700 Subject: rcu: Drive TASKS_RCU directly off of PREEMPT The actual use of TASKS_RCU is only when PREEMPT, otherwise RCU-sched is used instead. This commit therefore makes synchronize_rcu_tasks() and call_rcu_tasks() available always, but mapped to synchronize_sched() and call_rcu_sched(), respectively, when !PREEMPT. This approach also allows some #ifdefs to be removed from rcutorture. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Reviewed-by: Masami Hiramatsu Acked-by: Ingo Molnar --- include/linux/rcupdate.h | 6 ++++-- kernel/rcu/Kconfig | 3 +-- kernel/rcu/rcutorture.c | 17 +---------------- .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 2 +- 4 files changed, 7 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f816fc72b51e..c3f380befdd7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -58,8 +58,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); void call_rcu_bh(struct rcu_head *head, rcu_callback_t func); void call_rcu_sched(struct rcu_head *head, rcu_callback_t func); void synchronize_sched(void); -void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); -void synchronize_rcu_tasks(void); void rcu_barrier_tasks(void); #ifdef CONFIG_PREEMPT_RCU @@ -176,10 +174,14 @@ extern struct srcu_struct tasks_rcu_exit_srcu; rcu_all_qs(); \ rcu_note_voluntary_context_switch_lite(t); \ } while (0) +void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); +void synchronize_rcu_tasks(void); #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) #define rcu_note_voluntary_context_switch_lite(t) do { } while (0) #define rcu_note_voluntary_context_switch(t) rcu_all_qs() +#define call_rcu_tasks call_rcu_sched +#define synchronize_rcu_tasks synchronize_sched #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index be90c945063f..9210379c0353 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -69,8 +69,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - bool - default n + def_bool PREEMPT select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b8f7f8ce8575..b284c861a511 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -696,8 +696,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks torture testing. */ @@ -735,24 +733,11 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -#define RCUTORTURE_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUTORTURE_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -1749,7 +1734,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &sched_ops, RCUTORTURE_TASKS_OPS + &sched_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 9ad3f89c8dc7..af6fca03602f 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -69,11 +69,11 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE CONFIG_PREEMPT_RCU CONFIG_TREE_RCU CONFIG_TINY_RCU +CONFIG_TASKS_RCU These are controlled by CONFIG_PREEMPT and/or CONFIG_SMP. CONFIG_SRCU -CONFIG_TASKS_RCU Selected by CONFIG_RCU_TORTURE_TEST, so cannot disable. -- cgit v1.2.3 From eee99e9701abae30797645dd1cdf376cb855835b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 May 2017 08:51:48 -0700 Subject: rcu: Create reasonable API for do_exit() TASKS_RCU processing Currently, the exit-time support for TASKS_RCU is open-coded in do_exit(). This commit creates exit_tasks_rcu_start() and exit_tasks_rcu_finish() APIs for do_exit() use. This has the benefit of confining the use of the tasks_rcu_exit_srcu variable to one file, allowing it to become static. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 7 ++++--- include/linux/sched.h | 5 +++-- kernel/exit.c | 7 ++----- kernel/rcu/update.c | 18 +++++++++++++++++- 4 files changed, 26 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c3f380befdd7..ce9d21923d75 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -162,8 +162,6 @@ static inline void rcu_init_nohz(void) { } * macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU -#define TASKS_RCU(x) x -extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch_lite(t) \ do { \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ @@ -176,12 +174,15 @@ extern struct srcu_struct tasks_rcu_exit_srcu; } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); +void exit_tasks_rcu_start(void); +void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU */ -#define TASKS_RCU(x) do { } while (0) #define rcu_note_voluntary_context_switch_lite(t) do { } while (0) #define rcu_note_voluntary_context_switch(t) rcu_all_qs() #define call_rcu_tasks call_rcu_sched #define synchronize_rcu_tasks synchronize_sched +static inline void exit_tasks_rcu_start(void) { } +static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/sched.h b/include/linux/sched.h index d2f291a3a44a..0db4870b477c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -589,9 +589,10 @@ struct task_struct { #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; - bool rcu_tasks_holdout; - struct list_head rcu_tasks_holdout_list; + u8 rcu_tasks_holdout; + u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; + struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ struct sched_info sched_info; diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..d297c525f188 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -764,7 +764,6 @@ void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; - TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); kcov_task_exit(tsk); @@ -881,9 +880,7 @@ void __noreturn do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); - TASKS_RCU(preempt_disable()); - TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); - TASKS_RCU(preempt_enable()); + exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); @@ -918,7 +915,7 @@ void __noreturn do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); - TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); + exit_tasks_rcu_finish(); do_task_dead(); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 00e77c470017..5033b66d2753 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); /* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_SRCU(tasks_rcu_exit_srcu); +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void) mutex_unlock(&rcu_tasks_kthread_mutex); } +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) +{ + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + preempt_enable(); +} + #endif /* #ifdef CONFIG_TASKS_RCU */ #ifndef CONFIG_TINY_RCU -- cgit v1.2.3 From c1fd481491e9e3e05d2749c8e68ef8d64ab29b38 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Jun 2017 15:49:39 -0700 Subject: rcu: Add TPS() to event-traced strings Strings used in event tracing need to be specially handled, for example, using the TPS() macro. Without the TPS() macro, although output looks fine from within a running kernel, extracting traces from a crash dump produces garbage instead of strings. This commit therefore adds the TPS() macro to some unadorned strings that were passed to event-tracing macros. Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree_plugin.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bb9e6e43130f..14ba496a13cd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2073,7 +2073,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); @@ -2083,7 +2083,7 @@ wait_again: raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll")); } /* @@ -2111,7 +2111,7 @@ wait_again: schedule_timeout_interruptible(1); } else { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, - "WokeEmpty"); + TPS("WokeEmpty")); } goto wait_again; } @@ -2155,7 +2155,7 @@ wait_again: static void nocb_follower_wait(struct rcu_data *rdp) { for (;;) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); swait_event_interruptible(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { @@ -2163,7 +2163,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) return; } WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeEmpty"); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty")); } } @@ -2198,7 +2198,7 @@ static int rcu_nocb_kthread(void *arg) rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, -- cgit v1.2.3 From aa8ea1df73a7d822cfdc71727a9719f1e0ce28e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 Jun 2017 16:44:19 -0700 Subject: rcu: Move rcu.h to new trivial-function style This commit saves a few lines in kernel/rcu/rcu.h by moving to single-line definitions for trivial functions, instead of the old style where the two curly braces each get their own line. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 128 +++++++++---------------------------------------------- 1 file changed, 20 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 808b8c85f626..e4b43fef89f5 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -356,22 +356,10 @@ do { \ #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ -static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ -{ - return true; -} -static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ -{ - return false; -} - -static inline void rcu_expedite_gp(void) -{ -} - -static inline void rcu_unexpedite_gp(void) -{ -} +static inline bool rcu_gp_is_normal(void) { return true; } +static inline bool rcu_gp_is_expedited(void) { return false; } +static inline void rcu_expedite_gp(void) { } +static inline void rcu_unexpedite_gp(void) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ @@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, *gpnum = 0; *completed = 0; } -static inline void rcutorture_record_test_transition(void) -{ -} -static inline void rcutorture_record_progress(unsigned long vernum) -{ -} +static inline void rcutorture_record_test_transition(void) { } +static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU - -/* - * Return the number of grace periods started. - */ -static inline unsigned long rcu_batches_started(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods started. - */ -static inline unsigned long rcu_batches_started_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods started. - */ -static inline unsigned long rcu_batches_started_sched(void) -{ - return 0; -} - -/* - * Return the number of grace periods completed. - */ -static inline unsigned long rcu_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods completed. - */ -static inline unsigned long rcu_batches_completed_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods completed. - */ -static inline unsigned long rcu_batches_completed_sched(void) -{ - return 0; -} - -/* - * Return the number of expedited grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of expedited sched grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed_sched(void) -{ - return 0; -} - -static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return 0; -} - -static inline void rcu_force_quiescent_state(void) -{ -} - -static inline void rcu_bh_force_quiescent_state(void) -{ -} - -static inline void rcu_sched_force_quiescent_state(void) -{ -} - -static inline void show_rcu_gp_kthreads(void) -{ -} - +static inline unsigned long rcu_batches_started(void) { return 0; } +static inline unsigned long rcu_batches_started_bh(void) { return 0; } +static inline unsigned long rcu_batches_started_sched(void) { return 0; } +static inline unsigned long rcu_batches_completed(void) { return 0; } +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } +static inline unsigned long rcu_batches_completed_sched(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } +static inline unsigned long +srcu_batches_completed(struct srcu_struct *sp) { return 0; } +static inline void rcu_force_quiescent_state(void) { } +static inline void rcu_bh_force_quiescent_state(void) { } +static inline void rcu_sched_force_quiescent_state(void) { } +static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -- cgit v1.2.3 From ddcadb6962f4554da97e5694a7b60f4d55fc4c32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jun 2017 10:32:23 -0700 Subject: rcu: Add event tracing to ->gp_tasks update at GP start There is currently event tracing to track when a task is preempted within a preemptible RCU read-side critical section, and also when that task subsequently reaches its outermost rcu_read_unlock(), but none indicating when a new grace period starts when that grace period must wait on pre-existing readers that have been been preempted at least once since the beginning of their current RCU read-side critical sections. This commit therefore adds an event trace at grace-period start in the case where there are such readers. Note that only the first reader in the list is traced. Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree_plugin.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 14ba496a13cd..3e3f92e981a1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -636,10 +636,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + struct task_struct *t; + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (rcu_preempt_has_tasks(rnp)) + if (rcu_preempt_has_tasks(rnp)) { rnp->gp_tasks = rnp->blkd_tasks.next; + t = container_of(rnp->gp_tasks, struct task_struct, + rcu_node_entry); + trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), + rnp->gpnum, t->pid); + } WARN_ON_ONCE(rnp->qsmask); } -- cgit v1.2.3 From 406e7046de0c9e0792f25e47162dff4daf9bc0fd Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 20 Jun 2017 14:45:47 -0700 Subject: rcu: use idle versions of swait to make idle-hack clear These RCU waits were set to use interruptible waits to avoid the kthreads contributing to system load average, even though they are not interruptible as they are spawned from a kthread. Use the new TASK_IDLE swaits which makes our goal clear, and removes confusion about these paths possibly being interruptible -- they are not. When the system is idle the RCU grace-period kthread will spend all its time blocked inside the swait_event_interruptible(). If the interruptible() was not used, then this kthread would contribute to the load average. This means that an idle system would have a load average of 2 (or 3 if PREEMPT=y), rather than the load average of 0 that almost fifty years of UNIX has conditioned sysadmins to expect. The same argument applies to swait_event_interruptible_timeout() use. The RCU grace-period kthread spends its time blocked inside this call while waiting for grace periods to complete. In particular, if there was only one busy CPU, but that CPU was frequently invoking call_rcu(), then the RCU grace-period kthread would spend almost all its time blocked inside the swait_event_interruptible_timeout(). This would mean that the load average would be 2 rather than the expected 1 for the single busy CPU. Acked-by: "Eric W. Biederman" Tested-by: Paul E. McKenney Signed-off-by: Luis R. Rodriguez Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e40cb5190783..8128de9d9dca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2052,8 +2052,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* - * Helper function for wait_event_interruptible_timeout() wakeup - * at force-quiescent-state time. + * Helper function for swait_event_idle() wakeup at force-quiescent-state + * time. */ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) { @@ -2191,9 +2191,8 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_interruptible(rsp->gp_wq, - READ_ONCE(rsp->gp_flags) & - RCU_GP_FLAG_INIT); + swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & + RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) @@ -2224,7 +2223,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_idle_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ -- cgit v1.2.3 From 41d541ca3cb12c40cd6d6b3fdbcd03b289e35eb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Jun 2017 13:22:17 -0700 Subject: rcu: Add TPS() protection for _rcu_barrier_trace strings The _rcu_barrier_trace() function is a wrapper for trace_rcu_barrier(), which needs TPS() protection for strings passed through the second argument. However, it has escaped prior TPS()-ification efforts because it _rcu_barrier_trace() does not start with "trace_". This commit therefore adds the needed TPS() protection Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8128de9d9dca..62951918c0d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3553,10 +3553,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("LastCB"), -1, + rsp->barrier_sequence); complete(&rsp->barrier_completion); } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence); } } @@ -3568,14 +3569,15 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { atomic_inc(&rsp->barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1, + rsp->barrier_sequence); } } @@ -3589,14 +3591,15 @@ static void _rcu_barrier(struct rcu_state *rsp) struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Begin", -1, s); + _rcu_barrier_trace(rsp, TPS("Begin"), -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); /* Did someone else do our work for us? */ if (rcu_seq_done(&rsp->barrier_sequence, s)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1, + rsp->barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; @@ -3604,7 +3607,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Mark the start of the barrier operation. */ rcu_seq_start(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3627,10 +3630,10 @@ static void _rcu_barrier(struct rcu_state *rsp) rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { - _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu, rsp->barrier_sequence); } else { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu, rsp->barrier_sequence); smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); @@ -3638,11 +3641,11 @@ static void _rcu_barrier(struct rcu_state *rsp) rcu_barrier_callback, rsp, cpu, 0); } } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { - _rcu_barrier_trace(rsp, "OnlineQ", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu, rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); } else { - _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu, rsp->barrier_sequence); } } @@ -3659,7 +3662,7 @@ static void _rcu_barrier(struct rcu_state *rsp) wait_for_completion(&rsp->barrier_completion); /* Mark the end of the barrier operation. */ - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence); rcu_seq_end(&rsp->barrier_sequence); /* Other rcu_barrier() invocations can now safely proceed. */ -- cgit v1.2.3 From cb6535743270f385db6098969d737a52385af962 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 27 Jun 2017 17:41:25 -0700 Subject: rcu/tracing: Set disable_rcu_irq_enter on rcu_eqs_exit() Set disable_rcu_irq_enter on not only rcu_eqs_enter_common() but also rcu_eqs_exit(), since rcu_eqs_exit() suffers from the same issue as was fixed for rcu_eqs_enter_common() by commit 03ecd3f48e57 ("rcu/tracing: Add rcu_disabled to denote when rcu_irq_enter() will not work"). Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 62951918c0d1..5f72f670bc72 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -955,8 +955,10 @@ static void rcu_eqs_exit(bool user) if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { + __this_cpu_inc(disable_rcu_irq_enter); rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_eqs_exit_common(oldval, user); + __this_cpu_dec(disable_rcu_irq_enter); } } -- cgit v1.2.3 From e1d32078bb9487309f0fb90ffb6ca7f708963bdb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Jul 2017 21:52:31 -0700 Subject: rcu: Add assertions verifying blocked-tasks list This commit adds assertions verifying the consistency of the rcu_node structure's ->blkd_tasks list and its ->gp_tasks, ->exp_tasks, and ->boost_tasks pointers. In particular, the ->blkd_tasks lists must be empty except for leaf rcu_node structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree_plugin.h | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5f72f670bc72..2538ea33be4d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2395,6 +2395,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, mask, rnp->qsmask, rnp->level, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3e3f92e981a1..eadf8b95b5e9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) struct task_struct *t = current; lockdep_assert_held(&rnp->lock); + WARN_ON_ONCE(rdp->mynode != rnp); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); /* * Decide where to queue the newly blocked task. In theory, @@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; + WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != + !(rnp->qsmask & rdp->grpmask)); + WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != + !(rnp->expmask & rdp->grpmask)); raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* @@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t) if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; if (IS_ENABLED(CONFIG_RCU_BOOST)) { - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; } /* -- cgit v1.2.3 From f8652ac0e940ab2828ddd3def59dc7cfddfaa50d Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Wed, 12 Jul 2017 07:59:54 -0700 Subject: rcu: Make rcu_idle_enter() rely on callers disabling irqs All callers to rcu_idle_enter() have irqs disabled, so there is no point in rcu_idle_enter disabling them again. This commit therefore replaces the irq disabling with a RCU_LOCKDEP_WARN(). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2538ea33be4d..57d614e7697a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -843,11 +843,8 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { - unsigned long flags; - - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); rcu_eqs_enter(false); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); -- cgit v1.2.3 From d61cf245f83eec731c57b9195061803c84696794 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Jul 2017 09:03:35 -0700 Subject: rcu: Add warning to rcu_idle_enter() for irqs enabled All current callers of rcu_idle_enter() have irqs disabled, and rcu_idle_enter() relies on this, but doesn't check. This commit therefore adds a RCU_LOCKDEP_WARN() to add some verification to the trust. While we are there, pass "true" rather than "1" to rcu_eqs_enter(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 57d614e7697a..083a6029a063 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -859,7 +859,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { - rcu_eqs_enter(1); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!"); + rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ -- cgit v1.2.3 From 0ca52b12ab268e50ca58a25d6e09d9cd030c390d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Jul 2017 11:55:21 -0700 Subject: rcu: Remove exports from rcu_idle_exit() and rcu_idle_enter() The rcu_idle_exit() and rcu_idle_enter() functions are exported because they were originally used by RCU_NONIDLE(), which was intended to be usable from modules. However, RCU_NONIDLE() now instead uses rcu_irq_enter_irqson() and rcu_irq_exit_irqson(), which are not exported, and there have been no complaints. This commit therefore removes the exports from rcu_idle_exit() and rcu_idle_enter(). Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 083a6029a063..9ce1ab065b65 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -846,7 +846,6 @@ void rcu_idle_enter(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); rcu_eqs_enter(false); } -EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL /** @@ -979,7 +978,6 @@ void rcu_idle_exit(void) rcu_eqs_exit(false); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_exit); #ifdef CONFIG_NO_HZ_FULL /** -- cgit v1.2.3 From 7391304c4959cd99af241d8e2896d6b05650850c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 28 Jul 2017 16:40:40 -0400 Subject: membarrier: Expedited private command Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs using cpumask built from all runqueues for which current thread's mm is the same as the thread calling sys_membarrier. It executes faster than the non-expedited variant (no blocking). It also works on NOHZ_FULL configurations. Scheduler-wise, it requires a memory barrier before and after context switching between processes (which have different mm). The memory barrier before context switch is already present. For the barrier after context switch: * Our TSO archs can do RELEASE without being a full barrier. Look at x86 spin_unlock() being a regular STORE for example. But for those archs, all atomics imply smp_mb and all of them have atomic ops in switch_mm() for mm_cpumask(), and on x86 the CR3 load acts as a full barrier. * From all weakly ordered machines, only ARM64 and PPC can do RELEASE, the rest does indeed do smp_mb(), so there the spin_unlock() is a full barrier and we're good. * ARM64 has a very heavy barrier in switch_to(), which suffices. * PPC just removed its barrier from switch_to(), but appears to be talking about adding something to switch_mm(). So add a smp_mb__after_unlock_lock() for now, until this is settled on the PPC side. Changes since v3: - Properly document the memory barriers provided by each architecture. Changes since v2: - Address comments from Peter Zijlstra, - Add smp_mb__after_unlock_lock() after finish_lock_switch() in finish_task_switch() to add the memory barrier we need after storing to rq->curr. This is much simpler than the previous approach relying on atomic_dec_and_test() in mmdrop(), which actually added a memory barrier in the common case of switching between userspace processes. - Return -EINVAL when MEMBARRIER_CMD_SHARED is used on a nohz_full kernel, rather than having the whole membarrier system call returning -ENOSYS. Indeed, CMD_PRIVATE_EXPEDITED is compatible with nohz_full. Adapt the CMD_QUERY mask accordingly. Changes since v1: - move membarrier code under kernel/sched/ because it uses the scheduler runqueue, - only add the barrier when we switch from a kernel thread. The case where we switch from a user-space thread is already handled by the atomic_dec_and_test() in mmdrop(). - add a comment to mmdrop() documenting the requirement on the implicit memory barrier. CC: Peter Zijlstra CC: Paul E. McKenney CC: Boqun Feng CC: Andrew Hunter CC: Maged Michael CC: gromer@google.com CC: Avi Kivity CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: Michael Ellerman Signed-off-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney Tested-by: Dave Watson --- MAINTAINERS | 2 +- arch/arm64/kernel/process.c | 2 + include/uapi/linux/membarrier.h | 23 +++++- kernel/Makefile | 1 - kernel/membarrier.c | 70 ------------------ kernel/sched/Makefile | 1 + kernel/sched/core.c | 25 +++++++ kernel/sched/membarrier.c | 152 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 202 insertions(+), 74 deletions(-) delete mode 100644 kernel/membarrier.c create mode 100644 kernel/sched/membarrier.c (limited to 'kernel') diff --git a/MAINTAINERS b/MAINTAINERS index f66488dfdbc9..3b035584272f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8621,7 +8621,7 @@ M: Mathieu Desnoyers M: "Paul E. McKenney" L: linux-kernel@vger.kernel.org S: Supported -F: kernel/membarrier.c +F: kernel/sched/membarrier.c F: include/uapi/linux/membarrier.h MEMORY MANAGEMENT diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 659ae8094ed5..c8f7d98d8cb9 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, /* * Complete any pending TLB or cache maintenance on this CPU in case * the thread migrates to a different CPU. + * This full barrier is also required by the membarrier system + * call. */ dsb(ish); diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h index e0b108bd2624..6d47b3249d8a 100644 --- a/include/uapi/linux/membarrier.h +++ b/include/uapi/linux/membarrier.h @@ -40,14 +40,33 @@ * (non-running threads are de facto in such a * state). This covers threads from all processes * running on the system. This command returns 0. + * @MEMBARRIER_CMD_PRIVATE_EXPEDITED: + * Execute a memory barrier on each running + * thread belonging to the same process as the current + * thread. Upon return from system call, the + * caller thread is ensured that all its running + * threads siblings have passed through a state + * where all memory accesses to user-space + * addresses match program order between entry + * to and return from the system call + * (non-running threads are de facto in such a + * state). This only covers threads from the + * same processes as the caller thread. This + * command returns 0. The "expedited" commands + * complete faster than the non-expedited ones, + * they never block, but have the downside of + * causing extra overhead. * * Command to be passed to the membarrier system call. The commands need to * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to * the value 0. */ enum membarrier_cmd { - MEMBARRIER_CMD_QUERY = 0, - MEMBARRIER_CMD_SHARED = (1 << 0), + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_SHARED = (1 << 0), + /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */ + /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */ + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), }; #endif /* _UAPI_LINUX_MEMBARRIER_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 4cb8e8b23c6e..9c323a6daa46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/membarrier.c b/kernel/membarrier.c deleted file mode 100644 index 9f9284f37f8d..000000000000 --- a/kernel/membarrier.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2010, 2015 Mathieu Desnoyers - * - * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include - -/* - * Bitmask made from a "or" of all commands within enum membarrier_cmd, - * except MEMBARRIER_CMD_QUERY. - */ -#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) - -/** - * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. - * - * If this system call is not implemented, -ENOSYS is returned. If the - * command specified does not exist, or if the command argument is invalid, - * this system call returns -EINVAL. For a given command, with flags argument - * set to 0, this system call is guaranteed to always return the same value - * until reboot. - * - * All memory accesses performed in program order from each targeted thread - * is guaranteed to be ordered with respect to sys_membarrier(). If we use - * the semantic "barrier()" to represent a compiler barrier forcing memory - * accesses to be performed in program order across the barrier, and - * smp_mb() to represent explicit memory barriers forcing full memory - * ordering across the barrier, we have the following ordering table for - * each pair of barrier(), sys_membarrier() and smp_mb(): - * - * The pair ordering is detailed as (O: ordered, X: not ordered): - * - * barrier() smp_mb() sys_membarrier() - * barrier() X X O - * smp_mb() X O O - * sys_membarrier() O O O - */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) -{ - /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ - if (tick_nohz_full_enabled()) - return -ENOSYS; - if (unlikely(flags)) - return -EINVAL; - switch (cmd) { - case MEMBARRIER_CMD_QUERY: - return MEMBARRIER_CMD_BITMASK; - case MEMBARRIER_CMD_SHARED: - if (num_online_cpus() > 1) - synchronize_sched(); - return 0; - default: - return -EINVAL; - } -} diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 53f0164ed362..78f54932ea1d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfee6ea7db49..f77269c6c2f8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2640,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + /* + * The membarrier system call requires a full memory barrier + * after storing to rq->curr, before going back to user-space. + * + * TODO: This smp_mb__after_unlock_lock can go away if PPC end + * up adding a full barrier to switch_mm(), or we should figure + * out if a smp_mb__after_unlock_lock is really the proper API + * to use. + */ + smp_mb__after_unlock_lock(); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -3329,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; rq->curr = next; + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. For TSO + * (e.g. x86), the architecture must provide its own + * barrier in switch_mm(). For weakly ordered machines + * for which spin_unlock() acts as a full memory + * barrier, finish_lock_switch() in common code takes + * care of this barrier. For weakly ordered machines for + * which spin_unlock() acts as a RELEASE barrier (only + * arm64 and PowerPC), arm64 has a full barrier in + * switch_to(), and PowerPC has + * smp_mb__after_unlock_lock() before + * finish_lock_switch(). + */ ++*switch_count; trace_sched_switch(preempt, prev, next); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c new file mode 100644 index 000000000000..a92fddc22747 --- /dev/null +++ b/kernel/sched/membarrier.c @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2010-2017 Mathieu Desnoyers + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include + +#include "sched.h" /* for cpu_rq(). */ + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + +static void ipi_mb(void *info) +{ + smp_mb(); /* IPIs should be serializing but paranoid. */ +} + +static void membarrier_private_expedited(void) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (num_online_cpus() == 1) + return; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm == current->mm) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. + */ + smp_mb(); /* exit from system call is not a mb */ +} + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, not available on the running + * kernel, or if the command argument is invalid, this system call + * returns -EINVAL. For a given command, with flags argument set to 0, + * this system call is guaranteed to always return the same value until + * reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + { + int cmd_mask = MEMBARRIER_CMD_BITMASK; + + if (tick_nohz_full_enabled()) + cmd_mask &= ~MEMBARRIER_CMD_SHARED; + return cmd_mask; + } + case MEMBARRIER_CMD_SHARED: + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + if (tick_nohz_full_enabled()) + return -EINVAL; + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + case MEMBARRIER_CMD_PRIVATE_EXPEDITED: + membarrier_private_expedited(); + return 0; + default: + return -EINVAL; + } +} -- cgit v1.2.3