summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2013-09-03 07:41:11 +0200
committerIngo Molnar <mingo@kernel.org>2013-09-03 07:41:11 +0200
commit7d992feb7694a21ee81f22894b455dadd5d1c110 (patch)
treed0f0961186b1c31c536a26a7f986ad7ca677453b /kernel
parent6e4664525b1db28f8c4e1130957f70a94c19213e (diff)
parent25f27ce4a6a4995c8bdd69b4b2180465ed5ad2b8 (diff)
Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu
Pull RCU updates from Paul E. McKenney: " * Update RCU documentation. These were posted to LKML at https://lkml.org/lkml/2013/8/19/611. * Miscellaneous fixes. These were posted to LKML at https://lkml.org/lkml/2013/8/19/619. * Full-system idle detection. This is for use by Frederic Weisbecker's adaptive-ticks mechanism. Its purpose is to allow the timekeeping CPU to shut off its tick when all other CPUs are idle. These were posted to LKML at https://lkml.org/lkml/2013/8/19/648. * Improve rcutorture test coverage. These were posted to LKML at https://lkml.org/lkml/2013/8/19/675. " Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/rcu.h12
-rw-r--r--kernel/rcupdate.c102
-rw-r--r--kernel/rcutiny.c2
-rw-r--r--kernel/rcutiny_plugin.h2
-rw-r--r--kernel/rcutorture.c396
-rw-r--r--kernel/rcutree.c255
-rw-r--r--kernel/rcutree.h19
-rw-r--r--kernel/rcutree_plugin.h460
-rw-r--r--kernel/time/Kconfig50
-rw-r--r--kernel/trace/trace.h3
-rw-r--r--kernel/trace/trace_printk.c19
11 files changed, 873 insertions, 447 deletions
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 7f8e7590e3e5..77131966c4ad 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -67,12 +67,15 @@
extern struct debug_obj_descr rcuhead_debug_descr;
-static inline void debug_rcu_head_queue(struct rcu_head *head)
+static inline int debug_rcu_head_queue(struct rcu_head *head)
{
- debug_object_activate(head, &rcuhead_debug_descr);
+ int r1;
+
+ r1 = debug_object_activate(head, &rcuhead_debug_descr);
debug_object_active_state(head, &rcuhead_debug_descr,
STATE_RCU_HEAD_READY,
STATE_RCU_HEAD_QUEUED);
+ return r1;
}
static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
debug_object_deactivate(head, &rcuhead_debug_descr);
}
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-static inline void debug_rcu_head_queue(struct rcu_head *head)
+static inline int debug_rcu_head_queue(struct rcu_head *head)
{
+ return 0;
}
static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
extern void kfree(const void *);
-static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
+static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
unsigned long offset = (unsigned long)head->func;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index cce6ba8bbace..33eb4620aa17 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -212,43 +212,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head)
}
/*
- * fixup_init is called when:
- * - an active object is initialized
- */
-static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
-{
- struct rcu_head *head = addr;
-
- switch (state) {
- case ODEBUG_STATE_ACTIVE:
- /*
- * Ensure that queued callbacks are all executed.
- * If we detect that we are nested in a RCU read-side critical
- * section, we should simply fail, otherwise we would deadlock.
- * In !PREEMPT configurations, there is no way to tell if we are
- * in a RCU read-side critical section or not, so we never
- * attempt any fixup and just print a warning.
- */
-#ifndef CONFIG_PREEMPT
- WARN_ON_ONCE(1);
- return 0;
-#endif
- if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
- irqs_disabled()) {
- WARN_ON_ONCE(1);
- return 0;
- }
- rcu_barrier();
- rcu_barrier_sched();
- rcu_barrier_bh();
- debug_object_init(head, &rcuhead_debug_descr);
- return 1;
- default:
- return 0;
- }
-}
-
-/*
* fixup_activate is called when:
* - an active object is activated
* - an unknown object is activated (might be a statically initialized object)
@@ -268,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
debug_object_init(head, &rcuhead_debug_descr);
debug_object_activate(head, &rcuhead_debug_descr);
return 0;
-
- case ODEBUG_STATE_ACTIVE:
- /*
- * Ensure that queued callbacks are all executed.
- * If we detect that we are nested in a RCU read-side critical
- * section, we should simply fail, otherwise we would deadlock.
- * In !PREEMPT configurations, there is no way to tell if we are
- * in a RCU read-side critical section or not, so we never
- * attempt any fixup and just print a warning.
- */
-#ifndef CONFIG_PREEMPT
- WARN_ON_ONCE(1);
- return 0;
-#endif
- if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
- irqs_disabled()) {
- WARN_ON_ONCE(1);
- return 0;
- }
- rcu_barrier();
- rcu_barrier_sched();
- rcu_barrier_bh();
- debug_object_activate(head, &rcuhead_debug_descr);
- return 1;
default:
- return 0;
- }
-}
-
-/*
- * fixup_free is called when:
- * - an active object is freed
- */
-static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
-{
- struct rcu_head *head = addr;
-
- switch (state) {
- case ODEBUG_STATE_ACTIVE:
- /*
- * Ensure that queued callbacks are all executed.
- * If we detect that we are nested in a RCU read-side critical
- * section, we should simply fail, otherwise we would deadlock.
- * In !PREEMPT configurations, there is no way to tell if we are
- * in a RCU read-side critical section or not, so we never
- * attempt any fixup and just print a warning.
- */
-#ifndef CONFIG_PREEMPT
- WARN_ON_ONCE(1);
- return 0;
-#endif
- if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
- irqs_disabled()) {
- WARN_ON_ONCE(1);
- return 0;
- }
- rcu_barrier();
- rcu_barrier_sched();
- rcu_barrier_bh();
- debug_object_free(head, &rcuhead_debug_descr);
return 1;
- default:
- return 0;
}
}
@@ -369,15 +271,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
struct debug_obj_descr rcuhead_debug_descr = {
.name = "rcu_head",
- .fixup_init = rcuhead_fixup_init,
.fixup_activate = rcuhead_fixup_activate,
- .fixup_free = rcuhead_fixup_free,
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
-void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp,
+void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old, unsigned long c)
{
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index aa344111de3e..9ed6075dc562 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -264,7 +264,7 @@ void rcu_check_callbacks(int cpu, int user)
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
- char *rn = NULL;
+ const char *rn = NULL;
struct rcu_head *next, *list;
unsigned long flags;
RCU_TRACE(int cb_count = 0);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 0cd385acccfa..280d06cae352 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -36,7 +36,7 @@ struct rcu_ctrlblk {
RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
- RCU_TRACE(char *name); /* Name of RCU type. */
+ RCU_TRACE(const char *name); /* Name of RCU type. */
};
/* Definition for rcupdate control block. */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index f4871e52c546..be63101c6175 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -52,72 +52,78 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
-static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
-static int nfakewriters = 4; /* # fake writer threads */
-static int stat_interval = 60; /* Interval between stats, in seconds. */
- /* Zero means "only at end of test". */
-static bool verbose; /* Print more debug info. */
-static bool test_no_idle_hz = true;
- /* Test RCU support for tickless idle CPUs. */
-static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
-static int stutter = 5; /* Start/stop testing interval (in sec) */
-static int irqreader = 1; /* RCU readers from irq (timers). */
-static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
-static int fqs_holdoff; /* Hold time within burst (us). */
-static int fqs_stutter = 3; /* Wait time between bursts (s). */
-static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
-static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
-static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
-static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
-static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
-static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
-static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
-static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
-static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
-static char *torture_type = "rcu"; /* What RCU implementation to torture. */
-
-module_param(nreaders, int, 0444);
-MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
-module_param(nfakewriters, int, 0444);
-MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
-module_param(stat_interval, int, 0644);
-MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
-module_param(verbose, bool, 0444);
-MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
-module_param(test_no_idle_hz, bool, 0444);
-MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
-module_param(shuffle_interval, int, 0444);
-MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
-module_param(stutter, int, 0444);
-MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
-module_param(irqreader, int, 0444);
-MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
+static int fqs_duration;
module_param(fqs_duration, int, 0444);
-MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
+MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
+static int fqs_holdoff;
module_param(fqs_holdoff, int, 0444);
MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
+static int fqs_stutter = 3;
module_param(fqs_stutter, int, 0444);
MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+static bool gp_exp;
+module_param(gp_exp, bool, 0444);
+MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives");
+static bool gp_normal;
+module_param(gp_normal, bool, 0444);
+MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives");
+static int irqreader = 1;
+module_param(irqreader, int, 0444);
+MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
+static int n_barrier_cbs;
module_param(n_barrier_cbs, int, 0444);
MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
-module_param(onoff_interval, int, 0444);
-MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
+static int nfakewriters = 4;
+module_param(nfakewriters, int, 0444);
+MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
+static int nreaders = -1;
+module_param(nreaders, int, 0444);
+MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
+static int object_debug;
+module_param(object_debug, int, 0444);
+MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing");
+static int onoff_holdoff;
module_param(onoff_holdoff, int, 0444);
MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
+static int onoff_interval;
+module_param(onoff_interval, int, 0444);
+MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
+static int shuffle_interval = 3;
+module_param(shuffle_interval, int, 0444);
+MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
+static int shutdown_secs;
module_param(shutdown_secs, int, 0444);
-MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
+MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
+static int stall_cpu;
module_param(stall_cpu, int, 0444);
MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
+static int stall_cpu_holdoff = 10;
module_param(stall_cpu_holdoff, int, 0444);
MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
+static int stat_interval = 60;
+module_param(stat_interval, int, 0644);
+MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
+static int stutter = 5;
+module_param(stutter, int, 0444);
+MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
+static int test_boost = 1;
module_param(test_boost, int, 0444);
MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-module_param(test_boost_interval, int, 0444);
-MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
+static int test_boost_duration = 4;
module_param(test_boost_duration, int, 0444);
MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
+static int test_boost_interval = 7;
+module_param(test_boost_interval, int, 0444);
+MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
+static bool test_no_idle_hz = true;
+module_param(test_no_idle_hz, bool, 0444);
+MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
+static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
-MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
+static bool verbose;
+module_param(verbose, bool, 0444);
+MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
#define TORTURE_FLAG "-torture:"
#define PRINTK_STRING(s) \
@@ -267,7 +273,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
* Absorb kthreads into a kernel function that won't return, so that
* they won't ever access module text or data again.
*/
-static void rcutorture_shutdown_absorb(char *title)
+static void rcutorture_shutdown_absorb(const char *title)
{
if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
pr_notice(
@@ -337,7 +343,7 @@ rcu_random(struct rcu_random_state *rrsp)
}
static void
-rcu_stutter_wait(char *title)
+rcu_stutter_wait(const char *title)
{
while (stutter_pause_test || !rcutorture_runnable) {
if (rcutorture_runnable)
@@ -360,13 +366,14 @@ struct rcu_torture_ops {
int (*completed)(void);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
+ void (*exp_sync)(void);
void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
void (*cb_barrier)(void);
void (*fqs)(void);
int (*stats)(char *page);
int irq_capable;
int can_boost;
- char *name;
+ const char *name;
};
static struct rcu_torture_ops *cur_ops;
@@ -443,81 +450,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
call_rcu(&p->rtort_rcu, rcu_torture_cb);
}
-static struct rcu_torture_ops rcu_ops = {
- .init = NULL,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .completed = rcu_torture_completed,
- .deferred_free = rcu_torture_deferred_free,
- .sync = synchronize_rcu,
- .call = call_rcu,
- .cb_barrier = rcu_barrier,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .name = "rcu"
-};
-
-static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
-{
- int i;
- struct rcu_torture *rp;
- struct rcu_torture *rp1;
-
- cur_ops->sync();
- list_add(&p->rtort_free, &rcu_torture_removed);
- list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
- i = rp->rtort_pipe_count;
- if (i > RCU_TORTURE_PIPE_LEN)
- i = RCU_TORTURE_PIPE_LEN;
- atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
- rp->rtort_mbtest = 0;
- list_del(&rp->rtort_free);
- rcu_torture_free(rp);
- }
- }
-}
-
static void rcu_sync_torture_init(void)
{
INIT_LIST_HEAD(&rcu_torture_removed);
}
-static struct rcu_torture_ops rcu_sync_ops = {
+static struct rcu_torture_ops rcu_ops = {
.init = rcu_sync_torture_init,
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
.completed = rcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
+ .deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
- .call = NULL,
- .cb_barrier = NULL,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .name = "rcu_sync"
-};
-
-static struct rcu_torture_ops rcu_expedited_ops = {
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_torture_read_unlock,
- .completed = rcu_no_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_rcu_expedited,
- .call = NULL,
- .cb_barrier = NULL,
+ .exp_sync = synchronize_rcu_expedited,
+ .call = call_rcu,
+ .cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
.can_boost = rcu_can_boost(),
- .name = "rcu_expedited"
+ .name = "rcu"
};
/*
@@ -546,13 +499,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
}
static struct rcu_torture_ops rcu_bh_ops = {
- .init = NULL,
+ .init = rcu_sync_torture_init,
.readlock = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
.completed = rcu_bh_torture_completed,
.deferred_free = rcu_bh_torture_deferred_free,
.sync = synchronize_rcu_bh,
+ .exp_sync = synchronize_rcu_bh_expedited,
.call = call_rcu_bh,
.cb_barrier = rcu_barrier_bh,
.fqs = rcu_bh_force_quiescent_state,
@@ -561,38 +515,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
.name = "rcu_bh"
};
-static struct rcu_torture_ops rcu_bh_sync_ops = {
- .init = rcu_sync_torture_init,
- .readlock = rcu_bh_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_rcu_bh,
- .call = NULL,
- .cb_barrier = NULL,
- .fqs = rcu_bh_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .name = "rcu_bh_sync"
-};
-
-static struct rcu_torture_ops rcu_bh_expedited_ops = {
- .init = rcu_sync_torture_init,
- .readlock = rcu_bh_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_rcu_bh_expedited,
- .call = NULL,
- .cb_barrier = NULL,
- .fqs = rcu_bh_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .name = "rcu_bh_expedited"
-};
-
/*
* Definitions for srcu torture testing.
*/
@@ -667,6 +589,11 @@ static int srcu_torture_stats(char *page)
return cnt;
}
+static void srcu_torture_synchronize_expedited(void)
+{
+ synchronize_srcu_expedited(&srcu_ctl);
+}
+
static struct rcu_torture_ops srcu_ops = {
.init = rcu_sync_torture_init,
.readlock = srcu_torture_read_lock,
@@ -675,45 +602,13 @@ static struct rcu_torture_ops srcu_ops = {
.completed = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
+ .exp_sync = srcu_torture_synchronize_expedited,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.name = "srcu"
};
-static struct rcu_torture_ops srcu_sync_ops = {
- .init = rcu_sync_torture_init,
- .readlock = srcu_torture_read_lock,
- .read_delay = srcu_read_delay,
- .readunlock = srcu_torture_read_unlock,
- .completed = srcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = srcu_torture_synchronize,
- .call = NULL,
- .cb_barrier = NULL,
- .stats = srcu_torture_stats,
- .name = "srcu_sync"
-};
-
-static void srcu_torture_synchronize_expedited(void)
-{
- synchronize_srcu_expedited(&srcu_ctl);
-}
-
-static struct rcu_torture_ops srcu_expedited_ops = {
- .init = rcu_sync_torture_init,
- .readlock = srcu_torture_read_lock,
- .read_delay = srcu_read_delay,
- .readunlock = srcu_torture_read_unlock,
- .completed = srcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = srcu_torture_synchronize_expedited,
- .call = NULL,
- .cb_barrier = NULL,
- .stats = srcu_torture_stats,
- .name = "srcu_expedited"
-};
-
/*
* Definitions for sched torture testing.
*/
@@ -742,6 +637,8 @@ static struct rcu_torture_ops sched_ops = {
.completed = rcu_no_completed,
.deferred_free = rcu_sched_torture_deferred_free,
.sync = synchronize_sched,
+ .exp_sync = synchronize_sched_expedited,
+ .call = call_rcu_sched,
.cb_barrier = rcu_barrier_sched,
.fqs = rcu_sched_force_quiescent_state,
.stats = NULL,
@@ -749,35 +646,6 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
-static struct rcu_torture_ops sched_sync_ops = {
- .init = rcu_sync_torture_init,
- .readlock = sched_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = sched_torture_read_unlock,
- .completed = rcu_no_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_sched,
- .cb_barrier = NULL,
- .fqs = rcu_sched_force_quiescent_state,
- .stats = NULL,
- .name = "sched_sync"
-};
-
-static struct rcu_torture_ops sched_expedited_ops = {
- .init = rcu_sync_torture_init,
- .readlock = sched_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = sched_torture_read_unlock,
- .completed = rcu_no_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_sched_expedited,
- .cb_barrier = NULL,
- .fqs = rcu_sched_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .name = "sched_expedited"
-};
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
* CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -927,9 +795,10 @@ rcu_torture_fqs(void *arg)
static int
rcu_torture_writer(void *arg)
{
+ bool exp;
int i;
- long oldbatch = rcu_batches_completed();
struct rcu_torture *rp;
+ struct rcu_torture *rp1;
struct rcu_torture *old_rp;
static DEFINE_RCU_RANDOM(rand);
@@ -954,10 +823,33 @@ rcu_torture_writer(void *arg)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
old_rp->rtort_pipe_count++;
- cur_ops->deferred_free(old_rp);
+ if (gp_normal == gp_exp)
+ exp = !!(rcu_random(&rand) & 0x80);
+ else
+ exp = gp_exp;
+ if (!exp) {
+ cur_ops->deferred_free(old_rp);
+ } else {
+ cur_ops->exp_sync();
+ list_add(&old_rp->rtort_free,
+ &rcu_torture_removed);
+ list_for_each_entry_safe(rp, rp1,
+ &rcu_torture_removed,
+ rtort_free) {
+ i = rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ if (++rp->rtort_pipe_count >=
+ RCU_TORTURE_PIPE_LEN) {
+ rp->rtort_mbtest = 0;
+ list_del(&rp->rtort_free);
+ rcu_torture_free(rp);
+ }
+ }
+ }
}
rcutorture_record_progress(++rcu_torture_current_version);
- oldbatch = cur_ops->completed();
rcu_stutter_wait("rcu_torture_writer");
} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
@@ -983,10 +875,18 @@ rcu_torture_fakewriter(void *arg)
schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
udelay(rcu_random(&rand) & 0x3ff);
if (cur_ops->cb_barrier != NULL &&
- rcu_random(&rand) % (nfakewriters * 8) == 0)
+ rcu_random(&rand) % (nfakewriters * 8) == 0) {
cur_ops->cb_barrier();
- else
+ } else if (gp_normal == gp_exp) {
+ if (rcu_random(&rand) & 0x80)
+ cur_ops->sync();
+ else
+ cur_ops->exp_sync();
+ } else if (gp_normal) {
cur_ops->sync();
+ } else {
+ cur_ops->exp_sync();
+ }
rcu_stutter_wait("rcu_torture_fakewriter");
} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -1364,7 +1264,7 @@ rcu_torture_stutter(void *arg)
}
static inline void
-rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
{
pr_alert("%s" TORTURE_FLAG
"--- %s: nreaders=%d nfakewriters=%d "
@@ -1534,7 +1434,13 @@ rcu_torture_onoff(void *arg)
torture_type, cpu);
starttime = jiffies;
n_online_attempts++;
- if (cpu_up(cpu) == 0) {
+ ret = cpu_up(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "rcu_torture_onoff task: online %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
if (verbose)
pr_alert("%s" TORTURE_FLAG
"rcu_torture_onoff task: onlined %d\n",
@@ -1934,6 +1840,62 @@ rcu_torture_cleanup(void)
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
}
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+static void rcu_torture_leak_cb(struct rcu_head *rhp)
+{
+}
+
+static void rcu_torture_err_cb(struct rcu_head *rhp)
+{
+ /*
+ * This -might- happen due to race conditions, but is unlikely.
+ * The scenario that leads to this happening is that the
+ * first of the pair of duplicate callbacks is queued,
+ * someone else starts a grace period that includes that
+ * callback, then the second of the pair must wait for the
+ * next grace period. Unlikely, but can happen. If it
+ * does happen, the debug-objects subsystem won't have splatted.
+ */
+ pr_alert("rcutorture: duplicated callback was invoked.\n");
+}
+#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+/*
+ * Verify that double-free causes debug-objects to complain, but only
+ * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test
+ * cannot be carried out.
+ */
+static void rcu_test_debug_objects(void)
+{
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+ struct rcu_head rh1;
+ struct rcu_head rh2;
+
+ init_rcu_head_on_stack(&rh1);
+ init_rcu_head_on_stack(&rh2);
+ pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
+
+ /* Try to queue the rh2 pair of callbacks for the same grace period. */
+ preempt_disable(); /* Prevent preemption from interrupting test. */
+ rcu_read_lock(); /* Make it impossible to finish a grace period. */
+ call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ local_irq_disable(); /* Make it harder to start a new grace period. */
+ call_rcu(&rh2, rcu_torture_leak_cb);
+ call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ local_irq_enable();
+ rcu_read_unlock();
+ preempt_enable();
+
+ /* Wait for them all to get done so we can safely return. */
+ rcu_barrier();
+ pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
+ destroy_rcu_head_on_stack(&rh1);
+ destroy_rcu_head_on_stack(&rh2);
+#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+ pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
+#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+}
+
static int __init
rcu_torture_init(void)
{
@@ -1941,11 +1903,9 @@ rcu_torture_init(void)
int cpu;
int firsterr = 0;
int retval;
- static struct rcu_torture_ops *torture_ops[] =
- { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
- &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
- &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
- &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
+ static struct rcu_torture_ops *torture_ops[] = {
+ &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+ };
mutex_lock(&fullstop_mutex);
@@ -2163,6 +2123,8 @@ rcu_torture_init(void)
firsterr = retval;
goto unwind;
}
+ if (object_debug)
+ rcu_test_debug_objects();
rcutorture_record_test_transition();
mutex_unlock(&fullstop_mutex);
return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 068de3a93606..32618b3fe4e6 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -53,18 +53,38 @@
#include <linux/delay.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
+#include <linux/ftrace_event.h>
+#include <linux/suspend.h>
#include "rcutree.h"
#include <trace/events/rcu.h>
#include "rcu.h"
+/*
+ * Strings used in tracepoints need to be exported via the
+ * tracing system such that tools like perf and trace-cmd can
+ * translate the string address pointers to actual text.
+ */
+#define TPS(x) tracepoint_string(x)
+
/* Data structures. */
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
-#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
+/*
+ * In order to export the rcu_state name to the tracing tools, it
+ * needs to be added in the __tracepoint_string section.
+ * This requires defining a separate variable tp_<sname>_varname
+ * that points to the string being used, and this will allow
+ * the tracing userspace tools to be able to decipher the string
+ * address to the matching string.
+ */
+#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
+static char sname##_varname[] = #sname; \
+static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \
+struct rcu_state sname##_state = { \
.level = { &sname##_state.node[0] }, \
.call = cr, \
.fqs_state = RCU_GP_IDLE, \
@@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
- .name = #sname, \
+ .name = sname##_varname, \
.abbr = sabbr, \
-}
-
-struct rcu_state rcu_sched_state =
- RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
-DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
+}; \
+DEFINE_PER_CPU(struct rcu_data, sname##_data)
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
+RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
static struct rcu_state *rcu_state;
LIST_HEAD(rcu_struct_flavors);
@@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu)
struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
+ trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs"));
rdp->passed_quiesce = 1;
}
@@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu)
struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
+ trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
rdp->passed_quiesce = 1;
}
@@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu)
*/
void rcu_note_context_switch(int cpu)
{
- trace_rcu_utilization("Start context switch");
+ trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs(cpu);
rcu_preempt_note_context_switch(cpu);
- trace_rcu_utilization("End context switch");
+ trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
.dynticks = ATOMIC_INIT(1),
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
+ .dynticks_idle = ATOMIC_INIT(1),
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};
static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
@@ -226,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644);
static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
-static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
+static void force_qs_rnp(struct rcu_state *rsp,
+ int (*f)(struct rcu_data *rsp, bool *isidle,
+ unsigned long *maxj),
+ bool *isidle, unsigned long *maxj);
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(int cpu);
@@ -345,11 +369,11 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
bool user)
{
- trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
+ trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
struct task_struct *idle = idle_task(smp_processor_id());
- trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
+ trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
@@ -411,6 +435,7 @@ void rcu_idle_enter(void)
local_irq_save(flags);
rcu_eqs_enter(false);
+ rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -428,27 +453,6 @@ void rcu_user_enter(void)
{
rcu_eqs_enter(1);
}
-
-/**
- * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
- * after the current irq returns.
- *
- * This is similar to rcu_user_enter() but in the context of a non-nesting
- * irq. After this call, RCU enters into idle mode when the interrupt
- * returns.
- */
-void rcu_user_enter_after_irq(void)
-{
- unsigned long flags;
- struct rcu_dynticks *rdtp;
-
- local_irq_save(flags);
- rdtp = &__get_cpu_var(rcu_dynticks);
- /* Ensure this irq is interrupting a non-idle RCU state. */
- WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
- rdtp->dynticks_nesting = 1;
- local_irq_restore(flags);
-}
#endif /* CONFIG_RCU_USER_QS */
/**
@@ -479,9 +483,10 @@ void rcu_irq_exit(void)
rdtp->dynticks_nesting--;
WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
if (rdtp->dynticks_nesting)
- trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
+ trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
else
rcu_eqs_enter_common(rdtp, oldval, true);
+ rcu_sysidle_enter(rdtp, 1);
local_irq_restore(flags);
}
@@ -501,11 +506,11 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
smp_mb__after_atomic_inc(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
rcu_cleanup_after_idle(smp_processor_id());
- trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
+ trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
struct task_struct *idle = idle_task(smp_processor_id());
- trace_rcu_dyntick("Error on exit: not idle task",
+ trace_rcu_dyntick(TPS("Error on exit: not idle task"),
oldval, rdtp->dynticks_nesting);
ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -550,6 +555,7 @@ void rcu_idle_exit(void)
local_irq_save(flags);
rcu_eqs_exit(false);
+ rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -565,28 +571,6 @@ void rcu_user_exit(void)
{
rcu_eqs_exit(1);
}
-
-/**
- * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
- * idle mode after the current non-nesting irq returns.
- *
- * This is similar to rcu_user_exit() but in the context of an irq.
- * This is called when the irq has interrupted a userspace RCU idle mode
- * context. When the current non-nesting interrupt returns after this call,
- * the CPU won't restore the RCU idle mode.
- */
-void rcu_user_exit_after_irq(void)
-{
- unsigned long flags;
- struct rcu_dynticks *rdtp;
-
- local_irq_save(flags);
- rdtp = &__get_cpu_var(rcu_dynticks);
- /* Ensure we are interrupting an RCU idle mode. */
- WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
- rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
- local_irq_restore(flags);
-}
#endif /* CONFIG_RCU_USER_QS */
/**
@@ -620,9 +604,10 @@ void rcu_irq_enter(void)
rdtp->dynticks_nesting++;
WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
if (oldval)
- trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
+ trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
rcu_eqs_exit_common(rdtp, oldval, true);
+ rcu_sysidle_exit(rdtp, 1);
local_irq_restore(flags);
}
@@ -746,9 +731,11 @@ static int rcu_is_cpu_rrupt_from_idle(void)
* credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state.
*/
-static int dyntick_save_progress_counter(struct rcu_data *rdp)
+static int dyntick_save_progress_counter(struct rcu_data *rdp,
+ bool *isidle, unsigned long *maxj)
{
rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+ rcu_sysidle_check_cpu(rdp, isidle, maxj);
return (rdp->dynticks_snap & 0x1) == 0;
}
@@ -758,7 +745,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
* idle state since the last call to dyntick_save_progress_counter()
* for this same CPU, or by virtue of having been offline.
*/
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
+ bool *isidle, unsigned long *maxj)
{
unsigned int curr;
unsigned int snap;
@@ -775,7 +763,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* of the current RCU grace period.
*/
if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
- trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
rdp->dynticks_fqs++;
return 1;
}
@@ -795,7 +783,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
return 0; /* Grace period is not old enough. */
barrier();
if (cpu_is_offline(rdp->cpu)) {
- trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
rdp->offline_fqs++;
return 1;
}
@@ -1032,7 +1020,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
* rcu_nocb_wait_gp().
*/
static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long c, char *s)
+ unsigned long c, const char *s)
{
trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
rnp->completed, c, rnp->level,
@@ -1058,9 +1046,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
* grace period is already marked as needed, return to the caller.
*/
c = rcu_cbs_completed(rdp->rsp, rnp);
- trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
if (rnp->need_future_gp[c & 0x1]) {
- trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
return c;
}
@@ -1074,7 +1062,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
if (rnp->gpnum != rnp->completed ||
ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
rnp->need_future_gp[c & 0x1]++;
- trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
return c;
}
@@ -1102,7 +1090,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
* recorded, trace and leave.
*/
if (rnp_root->need_future_gp[c & 0x1]) {
- trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
goto unlock_out;
}
@@ -1111,9 +1099,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
/* If a grace period is not already in progress, start one. */
if (rnp_root->gpnum != rnp_root->completed) {
- trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
} else {
- trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
}
unlock_out:
@@ -1137,7 +1125,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
rcu_nocb_gp_cleanup(rsp, rnp);
rnp->need_future_gp[c & 0x1] = 0;
needmore = rnp->need_future_gp[(c + 1) & 0x1];
- trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
+ trace_rcu_future_gp(rnp, rdp, c,
+ needmore ? TPS("CleanupMore") : TPS("Cleanup"));
return needmore;
}
@@ -1205,9 +1194,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
/* Trace depending on how much we were able to accelerate. */
if (!*rdp->nxttail[RCU_WAIT_TAIL])
- trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
else
- trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
}
/*
@@ -1273,7 +1262,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
}
if (rdp->gpnum != rnp->gpnum) {
@@ -1283,7 +1272,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
* go looking for one.
*/
rdp->gpnum = rnp->gpnum;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
rdp->passed_quiesce = 0;
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
@@ -1315,6 +1304,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
+ rcu_bind_gp_kthread();
raw_spin_lock_irq(&rnp->lock);
rsp->gp_flags = 0; /* Clear all flags: New grace period. */
@@ -1326,7 +1316,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
/* Advance to a new grace period and initialize state. */
rsp->gpnum++;
- trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
+ trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
record_gp_stall_check_time(rsp);
raw_spin_unlock_irq(&rnp->lock);
@@ -1379,16 +1369,25 @@ static int rcu_gp_init(struct rcu_state *rsp)
int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
{
int fqs_state = fqs_state_in;
+ bool isidle = false;
+ unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
- force_qs_rnp(rsp, dyntick_save_progress_counter);
+ if (is_sysidle_rcu_state(rsp)) {
+ isidle = 1;
+ maxj = jiffies - ULONG_MAX / 4;
+ }
+ force_qs_rnp(rsp, dyntick_save_progress_counter,
+ &isidle, &maxj);
+ rcu_sysidle_report_gp(rsp, isidle, maxj);
fqs_state = RCU_FORCE_QS;
} else {
/* Handle dyntick-idle and offline CPUs. */
- force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
+ isidle = 0;
+ force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
}
/* Clear flag to prevent immediate re-entry. */
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -1448,7 +1447,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
rcu_nocb_gp_set(rnp, nocb);
rsp->completed = rsp->gpnum; /* Declare grace period done. */
- trace_rcu_grace_period(rsp->name, rsp->completed, "end");
+ trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
@@ -1558,10 +1557,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
/*
* We can't do wakeups while holding the rnp->lock, as that
- * could cause possible deadlocks with the rq->lock. Deter
- * the wakeup to interrupt context.
+ * could cause possible deadlocks with the rq->lock. Defer
+ * the wakeup to interrupt context. And don't bother waking
+ * up the running kthread.
*/
- irq_work_queue(&rsp->wakeup_work);
+ if (current != rsp->gp_kthread)
+ irq_work_queue(&rsp->wakeup_work);
}
/*
@@ -1857,7 +1858,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
RCU_TRACE(mask = rdp->grpmask);
trace_rcu_grace_period(rsp->name,
rnp->gpnum + 1 - !!(rnp->qsmask & mask),
- "cpuofl");
+ TPS("cpuofl"));
}
/*
@@ -2044,7 +2045,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
*/
void rcu_check_callbacks(int cpu, int user)
{
- trace_rcu_utilization("Start scheduler-tick");
+ trace_rcu_utilization(TPS("Start scheduler-tick"));
increment_cpu_stall_ticks();
if (user || rcu_is_cpu_rrupt_from_idle()) {
@@ -2077,7 +2078,7 @@ void rcu_check_callbacks(int cpu, int user)
rcu_preempt_check_callbacks(cpu);
if (rcu_pending(cpu))
invoke_rcu_core();
- trace_rcu_utilization("End scheduler-tick");
+ trace_rcu_utilization(TPS("End scheduler-tick"));
}
/*
@@ -2087,7 +2088,10 @@ void rcu_check_callbacks(int cpu, int user)
*
* The caller must have suppressed start of new grace periods.
*/
-static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
+static void force_qs_rnp(struct rcu_state *rsp,
+ int (*f)(struct rcu_data *rsp, bool *isidle,
+ unsigned long *maxj),
+ bool *isidle, unsigned long *maxj)
{
unsigned long bit;
int cpu;
@@ -2110,9 +2114,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
cpu = rnp->grplo;
bit = 1;
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
- if ((rnp->qsmask & bit) != 0 &&
- f(per_cpu_ptr(rsp->rda, cpu)))
- mask |= bit;
+ if ((rnp->qsmask & bit) != 0) {
+ if ((rnp->qsmaskinit & bit) != 0)
+ *isidle = 0;
+ if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+ mask |= bit;
+ }
}
if (mask != 0) {
@@ -2208,10 +2215,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
if (cpu_is_offline(smp_processor_id()))
return;
- trace_rcu_utilization("Start RCU core");
+ trace_rcu_utilization(TPS("Start RCU core"));
for_each_rcu_flavor(rsp)
__rcu_process_callbacks(rsp);
- trace_rcu_utilization("End RCU core");
+ trace_rcu_utilization(TPS("End RCU core"));
}
/*
@@ -2287,6 +2294,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
}
/*
+ * RCU callback function to leak a callback.
+ */
+static void rcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
+/*
* Helper function for call_rcu() and friends. The cpu argument will
* normally be -1, indicating "currently running CPU". It may specify
* a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
@@ -2300,7 +2314,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
struct rcu_data *rdp;
WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
- debug_rcu_head_queue(head);
+ if (debug_rcu_head_queue(head)) {
+ /* Probable double call_rcu(), so leak the callback. */
+ ACCESS_ONCE(head->func) = rcu_leak_callback;
+ WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
+ return;
+ }
head->func = func;
head->next = NULL;
@@ -2720,7 +2739,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
* Helper function for _rcu_barrier() tracing. If tracing is disabled,
* the compiler is expected to optimize this away.
*/
-static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
+static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s,
int cpu, unsigned long done)
{
trace_rcu_barrier(rsp->name, s, cpu,
@@ -2785,9 +2804,20 @@ static void _rcu_barrier(struct rcu_state *rsp)
* transition. The "if" expression below therefore rounds the old
* value up to the next even number and adds two before comparing.
*/
- snap_done = ACCESS_ONCE(rsp->n_barrier_done);
+ snap_done = rsp->n_barrier_done;
_rcu_barrier_trace(rsp, "Check", -1, snap_done);
- if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
+
+ /*
+ * If the value in snap is odd, we needed to wait for the current
+ * rcu_barrier() to complete, then wait for the next one, in other
+ * words, we need the value of snap_done to be three larger than
+ * the value of snap. On the other hand, if the value in snap is
+ * even, we only had to wait for the next rcu_barrier() to complete,
+ * in other words, we need the value of snap_done to be only two
+ * greater than the value of snap. The "(snap + 3) & ~0x1" computes
+ * this for us (thank you, Linus!).
+ */
+ if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
_rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rsp->barrier_mutex);
@@ -2930,6 +2960,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rdp->blimit = blimit;
init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+ rcu_sysidle_init_percpu_data(rdp->dynticks);
atomic_set(&rdp->dynticks->dynticks,
(atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
@@ -2952,7 +2983,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rdp->completed = rnp->completed;
rdp->passed_quiesce = 0;
rdp->qs_pending = 0;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
}
raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
rnp = rnp->parent;
@@ -2982,7 +3013,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
struct rcu_node *rnp = rdp->mynode;
struct rcu_state *rsp;
- trace_rcu_utilization("Start CPU hotplug");
+ trace_rcu_utilization(TPS("Start CPU hotplug"));
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
@@ -3011,7 +3042,26 @@ static int rcu_cpu_notify(struct notifier_block *self,
default:
break;
}
- trace_rcu_utilization("End CPU hotplug");
+ trace_rcu_utilization(TPS("End CPU hotplug"));
+ return NOTIFY_OK;
+}
+
+static int rcu_pm_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
+ rcu_expedited = 1;
+ break;
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ rcu_expedited = 0;
+ break;
+ default:
+ break;
+ }
return NOTIFY_OK;
}
@@ -3256,6 +3306,7 @@ void __init rcu_init(void)
* or the scheduler are operational.
*/
cpu_notifier(rcu_cpu_notify, 0);
+ pm_notifier(rcu_pm_notify, 0);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
}
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index b3832581043c..5f97eab602cd 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,6 +88,14 @@ struct rcu_dynticks {
/* Process level is worth LLONG_MAX/2. */
int dynticks_nmi_nesting; /* Track NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ long long dynticks_idle_nesting;
+ /* irq/process nesting level from idle. */
+ atomic_t dynticks_idle; /* Even value for idle, else odd. */
+ /* "Idle" excludes userspace execution. */
+ unsigned long dynticks_idle_jiffies;
+ /* End of last non-NMI non-idle period. */
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
#ifdef CONFIG_RCU_FAST_NO_HZ
bool all_lazy; /* Are all CPU's CBs lazy? */
unsigned long nonlazy_posted;
@@ -445,7 +453,7 @@ struct rcu_state {
/* for CPU stalls. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
- char *name; /* Name of structure. */
+ const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
struct list_head flavors; /* List of RCU flavors. */
struct irq_work wakeup_work; /* Postponed wakeups */
@@ -545,6 +553,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
static void rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
+static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj);
+static bool is_sysidle_rcu_state(struct rcu_state *rsp);
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj);
+static void rcu_bind_gp_kthread(void);
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 769e12e3151b..130c97b027f2 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,7 +28,7 @@
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/smpboot.h>
-#include <linux/tick.h>
+#include "time/tick-internal.h"
#define RCU_KTHREAD_PRIO 1
@@ -110,9 +110,7 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_TREE_PREEMPT_RCU
-struct rcu_state rcu_preempt_state =
- RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
+RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *rcu_state = &rcu_preempt_state;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -169,7 +167,7 @@ static void rcu_preempt_qs(int cpu)
struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
+ trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
rdp->passed_quiesce = 1;
current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
}
@@ -388,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t)
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
t->rcu_blocked_node = NULL;
- trace_rcu_unlock_preempted_task("rcu_preempt",
+ trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
rnp->gpnum, t->pid);
if (&t->rcu_node_entry == rnp->gp_tasks)
rnp->gp_tasks = np;
@@ -412,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t)
*/
empty_exp_now = !rcu_preempted_readers_exp(rnp);
if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
- trace_rcu_quiescent_state_report("preempt_rcu",
+ trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
rnp->gpnum,
0, rnp->qsmask,
rnp->level,
@@ -1250,12 +1248,12 @@ static int rcu_boost_kthread(void *arg)
int spincnt = 0;
int more2boost;
- trace_rcu_utilization("Start boost kthread@init");
+ trace_rcu_utilization(TPS("Start boost kthread@init"));
for (;;) {
rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
- trace_rcu_utilization("End boost kthread@rcu_wait");
+ trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
- trace_rcu_utilization("Start boost kthread@rcu_wait");
+ trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
more2boost = rcu_boost(rnp);
if (more2boost)
@@ -1264,14 +1262,14 @@ static int rcu_boost_kthread(void *arg)
spincnt = 0;
if (spincnt > 10) {
rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization("End boost kthread@rcu_yield");
+ trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
schedule_timeout_interruptible(2);
- trace_rcu_utilization("Start boost kthread@rcu_yield");
+ trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
spincnt = 0;
}
}
/* NOTREACHED */
- trace_rcu_utilization("End boost kthread@notreached");
+ trace_rcu_utilization(TPS("End boost kthread@notreached"));
return 0;
}
@@ -1419,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
int spincnt;
for (spincnt = 0; spincnt < 10; spincnt++) {
- trace_rcu_utilization("Start CPU kthread@rcu_wait");
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
this_cpu_inc(rcu_cpu_kthread_loops);
@@ -1431,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu)
rcu_kthread_do_work();
local_bh_enable();
if (*workp == 0) {
- trace_rcu_utilization("End CPU kthread@rcu_wait");
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
*statusp = RCU_KTHREAD_WAITING;
return;
}
}
*statusp = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization("Start CPU kthread@rcu_yield");
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
schedule_timeout_interruptible(2);
- trace_rcu_utilization("End CPU kthread@rcu_yield");
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
}
@@ -2202,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
* Wait for the grace period. Do so interruptibly to avoid messing
* up the load average.
*/
- trace_rcu_future_gp(rnp, rdp, c, "StartWait");
+ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
wait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
@@ -2210,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
if (likely(d))
break;
flush_signals(current);
- trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
+ trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
}
- trace_rcu_future_gp(rnp, rdp, c, "EndWait");
+ trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
smp_mb(); /* Ensure that CB invocation happens after GP end. */
}
@@ -2375,3 +2373,425 @@ static void rcu_kick_nohz_cpu(int cpu)
smp_send_reschedule(cpu);
#endif /* #ifdef CONFIG_NO_HZ_FULL */
}
+
+
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+
+/*
+ * Define RCU flavor that holds sysidle state. This needs to be the
+ * most active flavor of RCU.
+ */
+#ifdef CONFIG_PREEMPT_RCU
+static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+static int full_sysidle_state; /* Current system-idle state. */
+#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
+#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
+#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
+#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
+#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
+
+/*
+ * Invoked to note exit from irq or task transition to idle. Note that
+ * usermode execution does -not- count as idle here! After all, we want
+ * to detect full-system idle states, not RCU quiescent states and grace
+ * periods. The caller must have disabled interrupts.
+ */
+static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+{
+ unsigned long j;
+
+ /* Adjust nesting, check for fully idle. */
+ if (irq) {
+ rdtp->dynticks_idle_nesting--;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
+ if (rdtp->dynticks_idle_nesting != 0)
+ return; /* Still not fully idle. */
+ } else {
+ if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
+ DYNTICK_TASK_NEST_VALUE) {
+ rdtp->dynticks_idle_nesting = 0;
+ } else {
+ rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
+ return; /* Still not fully idle. */
+ }
+ }
+
+ /* Record start of fully idle period. */
+ j = jiffies;
+ ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+ smp_mb__before_atomic_inc();
+ atomic_inc(&rdtp->dynticks_idle);
+ smp_mb__after_atomic_inc();
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
+}
+
+/*
+ * Unconditionally force exit from full system-idle state. This is
+ * invoked when a normal CPU exits idle, but must be called separately
+ * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
+ * is that the timekeeping CPU is permitted to take scheduling-clock
+ * interrupts while the system is in system-idle state, and of course
+ * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
+ * interrupt from any other type of interrupt.
+ */
+void rcu_sysidle_force_exit(void)
+{
+ int oldstate = ACCESS_ONCE(full_sysidle_state);
+ int newoldstate;
+
+ /*
+ * Each pass through the following loop attempts to exit full
+ * system-idle state. If contention proves to be a problem,
+ * a trylock-based contention tree could be used here.
+ */
+ while (oldstate > RCU_SYSIDLE_SHORT) {
+ newoldstate = cmpxchg(&full_sysidle_state,
+ oldstate, RCU_SYSIDLE_NOT);
+ if (oldstate == newoldstate &&
+ oldstate == RCU_SYSIDLE_FULL_NOTED) {
+ rcu_kick_nohz_cpu(tick_do_timer_cpu);
+ return; /* We cleared it, done! */
+ }
+ oldstate = newoldstate;
+ }
+ smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
+}
+
+/*
+ * Invoked to note entry to irq or task transition from idle. Note that
+ * usermode execution does -not- count as idle here! The caller must
+ * have disabled interrupts.
+ */
+static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+{
+ /* Adjust nesting, check for already non-idle. */
+ if (irq) {
+ rdtp->dynticks_idle_nesting++;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
+ if (rdtp->dynticks_idle_nesting != 1)
+ return; /* Already non-idle. */
+ } else {
+ /*
+ * Allow for irq misnesting. Yes, it really is possible
+ * to enter an irq handler then never leave it, and maybe
+ * also vice versa. Handle both possibilities.
+ */
+ if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
+ rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
+ return; /* Already non-idle. */
+ } else {
+ rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
+ }
+ }
+
+ /* Record end of idle period. */
+ smp_mb__before_atomic_inc();
+ atomic_inc(&rdtp->dynticks_idle);
+ smp_mb__after_atomic_inc();
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
+
+ /*
+ * If we are the timekeeping CPU, we are permitted to be non-idle
+ * during a system-idle state. This must be the case, because
+ * the timekeeping CPU has to take scheduling-clock interrupts
+ * during the time that the system is transitioning to full
+ * system-idle state. This means that the timekeeping CPU must
+ * invoke rcu_sysidle_force_exit() directly if it does anything
+ * more than take a scheduling-clock interrupt.
+ */
+ if (smp_processor_id() == tick_do_timer_cpu)
+ return;
+
+ /* Update system-idle state: We are clearly no longer fully idle! */
+ rcu_sysidle_force_exit();
+}
+
+/*
+ * Check to see if the current CPU is idle. Note that usermode execution
+ * does not count as idle. The caller must have disabled interrupts.
+ */
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj)
+{
+ int cur;
+ unsigned long j;
+ struct rcu_dynticks *rdtp = rdp->dynticks;
+
+ /*
+ * If some other CPU has already reported non-idle, if this is
+ * not the flavor of RCU that tracks sysidle state, or if this
+ * is an offline or the timekeeping CPU, nothing to do.
+ */
+ if (!*isidle || rdp->rsp != rcu_sysidle_state ||
+ cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
+ return;
+ if (rcu_gp_in_progress(rdp->rsp))
+ WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
+
+ /* Pick up current idle and NMI-nesting counter and check. */
+ cur = atomic_read(&rdtp->dynticks_idle);
+ if (cur & 0x1) {
+ *isidle = false; /* We are not idle! */
+ return;
+ }
+ smp_mb(); /* Read counters before timestamps. */
+
+ /* Pick up timestamps. */
+ j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+ /* If this CPU entered idle more recently, update maxj timestamp. */
+ if (ULONG_CMP_LT(*maxj, j))
+ *maxj = j;
+}
+
+/*
+ * Is this the flavor of RCU that is handling full-system idle?
+ */
+static bool is_sysidle_rcu_state(struct rcu_state *rsp)
+{
+ return rsp == rcu_sysidle_state;
+}
+
+/*
+ * Bind the grace-period kthread for the sysidle flavor of RCU to the
+ * timekeeping CPU.
+ */
+static void rcu_bind_gp_kthread(void)
+{
+ int cpu = ACCESS_ONCE(tick_do_timer_cpu);
+
+ if (cpu < 0 || cpu >= nr_cpu_ids)
+ return;
+ if (raw_smp_processor_id() != cpu)
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+}
+
+/*
+ * Return a delay in jiffies based on the number of CPUs, rcu_node
+ * leaf fanout, and jiffies tick rate. The idea is to allow larger
+ * systems more time to transition to full-idle state in order to
+ * avoid the cache thrashing that otherwise occur on the state variable.
+ * Really small systems (less than a couple of tens of CPUs) should
+ * instead use a single global atomically incremented counter, and later
+ * versions of this will automatically reconfigure themselves accordingly.
+ */
+static unsigned long rcu_sysidle_delay(void)
+{
+ if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
+ return 0;
+ return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
+}
+
+/*
+ * Advance the full-system-idle state. This is invoked when all of
+ * the non-timekeeping CPUs are idle.
+ */
+static void rcu_sysidle(unsigned long j)
+{
+ /* Check the current state. */
+ switch (ACCESS_ONCE(full_sysidle_state)) {
+ case RCU_SYSIDLE_NOT:
+
+ /* First time all are idle, so note a short idle period. */
+ ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+ break;
+
+ case RCU_SYSIDLE_SHORT:
+
+ /*
+ * Idle for a bit, time to advance to next state?
+ * cmpxchg failure means race with non-idle, let them win.
+ */
+ if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
+ (void)cmpxchg(&full_sysidle_state,
+ RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
+ break;
+
+ case RCU_SYSIDLE_LONG:
+
+ /*
+ * Do an additional check pass before advancing to full.
+ * cmpxchg failure means race with non-idle, let them win.
+ */
+ if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
+ (void)cmpxchg(&full_sysidle_state,
+ RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/*
+ * Found a non-idle non-timekeeping CPU, so kick the system-idle state
+ * back to the beginning.
+ */
+static void rcu_sysidle_cancel(void)
+{
+ smp_mb();
+ ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+}
+
+/*
+ * Update the sysidle state based on the results of a force-quiescent-state
+ * scan of the CPUs' dyntick-idle state.
+ */
+static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
+ unsigned long maxj, bool gpkt)
+{
+ if (rsp != rcu_sysidle_state)
+ return; /* Wrong flavor, ignore. */
+ if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
+ return; /* Running state machine from timekeeping CPU. */
+ if (isidle)
+ rcu_sysidle(maxj); /* More idle! */
+ else
+ rcu_sysidle_cancel(); /* Idle is over. */
+}
+
+/*
+ * Wrapper for rcu_sysidle_report() when called from the grace-period
+ * kthread's context.
+ */
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj)
+{
+ rcu_sysidle_report(rsp, isidle, maxj, true);
+}
+
+/* Callback and function for forcing an RCU grace period. */
+struct rcu_sysidle_head {
+ struct rcu_head rh;
+ int inuse;
+};
+
+static void rcu_sysidle_cb(struct rcu_head *rhp)
+{
+ struct rcu_sysidle_head *rshp;
+
+ /*
+ * The following memory barrier is needed to replace the
+ * memory barriers that would normally be in the memory
+ * allocator.
+ */
+ smp_mb(); /* grace period precedes setting inuse. */
+
+ rshp = container_of(rhp, struct rcu_sysidle_head, rh);
+ ACCESS_ONCE(rshp->inuse) = 0;
+}
+
+/*
+ * Check to see if the system is fully idle, other than the timekeeping CPU.
+ * The caller must have disabled interrupts.
+ */
+bool rcu_sys_is_idle(void)
+{
+ static struct rcu_sysidle_head rsh;
+ int rss = ACCESS_ONCE(full_sysidle_state);
+
+ if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
+ return false;
+
+ /* Handle small-system case by doing a full scan of CPUs. */
+ if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
+ int oldrss = rss - 1;
+
+ /*
+ * One pass to advance to each state up to _FULL.
+ * Give up if any pass fails to advance the state.
+ */
+ while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
+ int cpu;
+ bool isidle = true;
+ unsigned long maxj = jiffies - ULONG_MAX / 4;
+ struct rcu_data *rdp;
+
+ /* Scan all the CPUs looking for nonidle CPUs. */
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
+ rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
+ if (!isidle)
+ break;
+ }
+ rcu_sysidle_report(rcu_sysidle_state,
+ isidle, maxj, false);
+ oldrss = rss;
+ rss = ACCESS_ONCE(full_sysidle_state);
+ }
+ }
+
+ /* If this is the first observation of an idle period, record it. */
+ if (rss == RCU_SYSIDLE_FULL) {
+ rss = cmpxchg(&full_sysidle_state,
+ RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
+ return rss == RCU_SYSIDLE_FULL;
+ }
+
+ smp_mb(); /* ensure rss load happens before later caller actions. */
+
+ /* If already fully idle, tell the caller (in case of races). */
+ if (rss == RCU_SYSIDLE_FULL_NOTED)
+ return true;
+
+ /*
+ * If we aren't there yet, and a grace period is not in flight,
+ * initiate a grace period. Either way, tell the caller that
+ * we are not there yet. We use an xchg() rather than an assignment
+ * to make up for the memory barriers that would otherwise be
+ * provided by the memory allocator.
+ */
+ if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
+ !rcu_gp_in_progress(rcu_sysidle_state) &&
+ !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
+ call_rcu(&rsh.rh, rcu_sysidle_cb);
+ return false;
+}
+
+/*
+ * Initialize dynticks sysidle state for CPUs coming online.
+ */
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
+{
+ rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
+}
+
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+{
+}
+
+static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+{
+}
+
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj)
+{
+}
+
+static bool is_sysidle_rcu_state(struct rcu_state *rsp)
+{
+ return false;
+}
+
+static void rcu_bind_gp_kthread(void)
+{
+}
+
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj)
+{
+}
+
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 70f27e89012b..3381f098070f 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -134,6 +134,56 @@ config NO_HZ_FULL_ALL
Note the boot CPU will still be kept outside the range to
handle the timekeeping duty.
+config NO_HZ_FULL_SYSIDLE
+ bool "Detect full-system idle state for full dynticks system"
+ depends on NO_HZ_FULL
+ default n
+ help
+ At least one CPU must keep the scheduling-clock tick running for
+ timekeeping purposes whenever there is a non-idle CPU, where
+ "non-idle" also includes dynticks CPUs as long as they are
+ running non-idle tasks. Because the underlying adaptive-tick
+ support cannot distinguish between all CPUs being idle and
+ all CPUs each running a single task in dynticks mode, the
+ underlying support simply ensures that there is always a CPU
+ handling the scheduling-clock tick, whether or not all CPUs
+ are idle. This Kconfig option enables scalable detection of
+ the all-CPUs-idle state, thus allowing the scheduling-clock
+ tick to be disabled when all CPUs are idle. Note that scalable
+ detection of the all-CPUs-idle state means that larger systems
+ will be slower to declare the all-CPUs-idle state.
+
+ Say Y if you would like to help debug all-CPUs-idle detection.
+
+ Say N if you are unsure.
+
+config NO_HZ_FULL_SYSIDLE_SMALL
+ int "Number of CPUs above which large-system approach is used"
+ depends on NO_HZ_FULL_SYSIDLE
+ range 1 NR_CPUS
+ default 8
+ help
+ The full-system idle detection mechanism takes a lazy approach
+ on large systems, as is required to attain decent scalability.
+ However, on smaller systems, scalability is not anywhere near as
+ large a concern as is energy efficiency. The sysidle subsystem
+ therefore uses a fast but non-scalable algorithm for small
+ systems and a lazier but scalable algorithm for large systems.
+ This Kconfig parameter defines the number of CPUs in the largest
+ system that will be considered to be "small".
+
+ The default value will be fine in most cases. Battery-powered
+ systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
+ numbers of CPUs, and (3) are suffering from battery-lifetime
+ problems due to long sysidle latencies might wish to experiment
+ with larger values for this Kconfig parameter. On the other
+ hand, they might be even better served by disabling NO_HZ_FULL
+ entirely, given that NO_HZ_FULL is intended for HPC and
+ real-time workloads that at present do not tend to be run on
+ battery-powered systems.
+
+ Take the default if you are unsure.
+
config NO_HZ
bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index afaae41b0a02..fe39acd4c1aa 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1022,6 +1022,9 @@ extern struct list_head ftrace_events;
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
+extern const char *__start___tracepoint_str[];
+extern const char *__stop___tracepoint_str[];
+
void trace_printk_init_buffers(void);
void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index a9077c1b4ad3..2900817ba65c 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos)
{
const char **fmt = v;
int start_index;
+ int last_index;
start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
if (*pos < start_index)
return __start___trace_bprintk_fmt + *pos;
+ /*
+ * The __tracepoint_str section is treated the same as the
+ * __trace_printk_fmt section. The difference is that the
+ * __trace_printk_fmt section should only be used by trace_printk()
+ * in a debugging environment, as if anything exists in that section
+ * the trace_prink() helper buffers are allocated, which would just
+ * waste space in a production environment.
+ *
+ * The __tracepoint_str sections on the other hand are used by
+ * tracepoints which need to map pointers to their strings to
+ * the ASCII text for userspace.
+ */
+ last_index = start_index;
+ start_index = __stop___tracepoint_str - __start___tracepoint_str;
+
+ if (*pos < last_index + start_index)
+ return __start___tracepoint_str + (*pos - last_index);
+
return find_next_mod_format(start_index, v, fmt, pos);
}