diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-04-02 09:59:09 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-04-02 09:59:09 -0700 |
commit | 8747a29173c6eb6f4b3e8d3b3bcabc0fa132678a (patch) | |
tree | e9c58072f6b681788c4692cb5b219e8d05bef330 | |
parent | cc67ccecd3e6e2827b6706bad3287786202498f5 (diff) | |
parent | c4fb5f37001514c0004d17b79cf74a499b9bc320 (diff) |
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU updates from Ingo Molnar:
"The main RCU subsystem changes in this cycle were:
- Miscellaneous fixes, perhaps most notably removing obsolete code
whose only purpose in life was to gather information for the
now-removed RCU debugfs facility. Other notable changes include
removing NO_HZ_FULL_ALL in favor of the nohz_full kernel boot
parameter, minor optimizations for expedited grace periods, some
added tracing, creating an RCU-specific workqueue using Tejun's new
WQ_MEM_RECLAIM flag, and several cleanups to code and comments.
- SRCU cleanups and optimizations.
- Torture-test updates, perhaps most notably the adding of ARMv8
support, but also including numerous cleanups and usability fixes"
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (37 commits)
rcu: Create RCU-specific workqueues with rescuers
torture: Provide more sensible nreader/nwriter defaults for rcuperf
torture: Grace periods do not piggyback off of themselves
torture: Adjust rcuperf trace processing to allow for workqueues
torture: Default jitter off when running rcuperf
torture: Specify qemu memory size with --memory argument
rcutorture: Add basic ARM64 support to run scripts
rcutorture: Update kvm.sh header comment
rcutorture: Record which grace-period primitives are tested
rcutorture: Re-enable testing of dynamic expediting
rcutorture: Avoid fake-writer use of undefined primitives
rcutorture: Abstract function and module names
rcutorture: Replace multi-instance kzalloc() with kcalloc()
rcu: Remove SRCU throttling
srcu: Remove dead code in srcu_gp_end()
srcu: Reduce scans of srcu_data in counter wrap check
srcu: Prevent sdp->srcu_gp_seq_needed_exp counter wrap
srcu: Abstract function name
rcu: Make expedited RCU CPU selection avoid unnecessary stores
rcu: Trace expedited GP delays due to transitioning CPUs
...
25 files changed, 238 insertions, 242 deletions
diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt index 2dcaf9adb7a7..9591092da5e0 100644 --- a/Documentation/timers/NO_HZ.txt +++ b/Documentation/timers/NO_HZ.txt @@ -131,13 +131,6 @@ error message, and the boot CPU will be removed from the mask. Note that this means that your system must have at least two CPUs in order for CONFIG_NO_HZ_FULL=y to do anything for you. -Alternatively, the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter specifies -that all CPUs other than the boot CPU are adaptive-ticks CPUs. This -Kconfig parameter will be overridden by the "nohz_full=" boot parameter, -so that if both the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter and -the "nohz_full=1" boot parameter is specified, the boot parameter will -prevail so that only CPU 1 will be an adaptive-ticks CPU. - Finally, adaptive-ticks CPUs must have their RCU callbacks offloaded. This is covered in the "RCU IMPLICATIONS" section below. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 043d04784675..36360d07f25b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -214,10 +214,12 @@ do { \ #endif /* - * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic - * initialization and destruction of rcu_head on the stack. rcu_head structures - * allocated dynamically in the heap or defined statically don't need any - * initialization. + * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls + * are needed for dynamic initialization and destruction of rcu_head + * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for + * dynamic initialization and destruction of statically allocated rcu_head + * structures. However, rcu_head structures allocated dynamically in the + * heap don't need any initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head); diff --git a/include/linux/types.h b/include/linux/types.h index c94d59ef96cc..ec13d02b3481 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -217,7 +217,7 @@ struct ustat { * * This guarantee is important for few reasons: * - future call_rcu_lazy() will make use of lower bits in the pointer; - * - the structure shares storage spacer in struct page with @compound_head, + * - the structure shares storage space in struct page with @compound_head, * which encode PageTail() in bit 0. The guarantee is needed to avoid * false-positive PageTail(). */ diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 0b50fda80db0..d8c33298c153 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -179,6 +179,10 @@ TRACE_EVENT(rcu_grace_period_init, * * "snap": Captured snapshot of expedited grace period sequence number. * "start": Started a real expedited grace period. + * "reset": Started resetting the tree + * "select": Started selecting the CPUs to wait on. + * "selectofl": Selected CPU partially offline. + * "startwait": Started waiting on selected CPUs. * "end": Ended a real expedited grace period. * "endwake": Woke piggybackers up. * "done": Someone else did the expedited grace period for us. diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 6334f2c1abd0..7a693e31184a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp) WARN_ON_ONCE(rcu_seq_state(*sp) != 1); } +/* Compute the end-of-grace-period value for the specified sequence number. */ +static inline unsigned long rcu_seq_endval(unsigned long *sp) +{ + return (*sp | RCU_SEQ_STATE_MASK) + 1; +} + /* Adjust sequence number for end of update-side operation. */ static inline void rcu_seq_end(unsigned long *sp) { smp_mb(); /* Ensure update-side operation before counter increment. */ WARN_ON_ONCE(!rcu_seq_state(*sp)); - WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); + WRITE_ONCE(*sp, rcu_seq_endval(sp)); } /* Take a snapshot of the update side's sequence number. */ @@ -295,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. */ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ - cpu <= rnp->grphi; \ - cpu = cpumask_next((cpu), cpu_possible_mask)) + for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + (cpu) <= rnp->grphi; \ + (cpu) = cpumask_next((cpu), cpu_possible_mask)) + +/* + * Iterate over all CPUs in a leaf RCU node's specified mask. + */ +#define rcu_find_next_bit(rnp, cpu, mask) \ + ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) +#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ + for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + (cpu) <= rnp->grphi; \ + (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) /* * Wrappers for the rcu_node::lock acquire and release. @@ -337,7 +353,7 @@ do { \ } while (0) #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) #define raw_spin_trylock_rcu_node(p) \ ({ \ @@ -348,6 +364,9 @@ do { \ ___locked; \ }) +#define raw_lockdep_assert_held_rcu_node(p) \ + lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) + #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ #ifdef CONFIG_TINY_RCU @@ -356,24 +375,20 @@ static inline bool rcu_gp_is_normal(void) { return true; } static inline bool rcu_gp_is_expedited(void) { return false; } static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 -#ifdef CONFIG_TINY_RCU -static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } -#else /* #ifdef CONFIG_TINY_RCU */ -void rcu_request_urgent_qs_task(struct task_struct *t); -#endif /* #else #ifdef CONFIG_TINY_RCU */ - enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, @@ -470,6 +485,7 @@ void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); +extern struct workqueue_struct *rcu_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d1ebdf9868bb..777e7a6a0292 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +/* + * The intended use cases for the nreaders and nwriters module parameters + * are as follows: + * + * 1. Specify only the nr_cpus kernel boot parameter. This will + * set both nreaders and nwriters to the value specified by + * nr_cpus for a mixed reader/writer test. + * + * 2. Specify the nr_cpus kernel boot parameter, but set + * rcuperf.nreaders to zero. This will set nwriters to the + * value specified by nr_cpus for an update-only test. + * + * 3. Specify the nr_cpus kernel boot parameter, but set + * rcuperf.nwriters to zero. This will set nreaders to the + * value specified by nr_cpus for a read-only test. + * + * Various other use cases may of course be specified. + */ + torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, 0, "Number of RCU reader threads"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 308e6fdbced8..680c96d8c00f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -909,34 +909,38 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - if (!can_expedite) { + if (!can_expedite) pr_alert("%s" TORTURE_FLAG - " GP expediting controlled from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Disabled dynamic grace-period expediting.\n", - torture_type); - } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; - if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) + if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; - else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) - pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); - if (gp_exp1 && cur_ops->exp_sync) + pr_info("%s: Testing conditional GPs.\n", __func__); + } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { + pr_alert("%s: gp_cond without primitives.\n", __func__); + } + if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; - else if (gp_exp && !cur_ops->exp_sync) - pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); - if (gp_normal1 && cur_ops->deferred_free) + pr_info("%s: Testing expedited GPs.\n", __func__); + } else if (gp_exp && !cur_ops->exp_sync) { + pr_alert("%s: gp_exp without primitives.\n", __func__); + } + if (gp_normal1 && cur_ops->deferred_free) { synctype[nsynctypes++] = RTWS_DEF_FREE; - else if (gp_normal && !cur_ops->deferred_free) - pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); - if (gp_sync1 && cur_ops->sync) + pr_info("%s: Testing asynchronous GPs.\n", __func__); + } else if (gp_normal && !cur_ops->deferred_free) { + pr_alert("%s: gp_normal without primitives.\n", __func__); + } + if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; - else if (gp_sync && !cur_ops->sync) - pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); + pr_info("%s: Testing normal GPs.\n", __func__); + } else if (gp_sync && !cur_ops->sync) { + pr_alert("%s: gp_sync without primitives.\n", __func__); + } if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { /* @@ -1011,6 +1015,9 @@ rcu_torture_writer(void *arg) rcu_unexpedite_gp(); if (++expediting > 3) expediting = -expediting; + } else if (!can_expedite) { /* Disabled during boot, recheck. */ + can_expedite = !rcu_gp_is_expedited() && + !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; stutter_wait("rcu_torture_writer"); @@ -1021,6 +1028,10 @@ rcu_torture_writer(void *arg) while (can_expedite && expediting++ < 0) rcu_unexpedite_gp(); WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); + if (!can_expedite) + pr_alert("%s" TORTURE_FLAG + " Dynamic grace-period expediting was disabled.\n", + torture_type); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; @@ -1045,13 +1056,13 @@ rcu_torture_fakewriter(void *arg) torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else if (gp_normal == gp_exp) { - if (torture_random(&rand) & 0x80) + if (cur_ops->sync && torture_random(&rand) & 0x80) cur_ops->sync(); - else + else if (cur_ops->exp_sync) cur_ops->exp_sync(); - } else if (gp_normal) { + } else if (gp_normal && cur_ops->sync) { cur_ops->sync(); - } else { + } else if (cur_ops->exp_sync) { cur_ops->exp_sync(); } stutter_wait("rcu_torture_fakewriter"); @@ -1557,11 +1568,10 @@ static int rcu_torture_barrier_init(void) atomic_set(&barrier_cbs_count, 0); atomic_set(&barrier_cbs_invoked, 0); barrier_cbs_tasks = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]), GFP_KERNEL); barrier_cbs_wq = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), - GFP_KERNEL); + kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL); if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { @@ -1674,7 +1684,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) * next grace period. Unlikely, but can happen. If it * does happen, the debug-objects subsystem won't have splatted. */ - pr_alert("rcutorture: duplicated callback was invoked.\n"); + pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ @@ -1691,7 +1701,7 @@ static void rcu_test_debug_objects(void) init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); /* Try to queue the rh2 pair of callbacks for the same grace period. */ preempt_disable(); /* Prevent preemption from interrupting test. */ @@ -1706,11 +1716,11 @@ static void rcu_test_debug_objects(void) /* Wait for them all to get done so we can safely return. */ rcu_barrier(); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } @@ -1799,7 +1809,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; if (nfakewriters > 0) { - fakewriter_tasks = kzalloc(nfakewriters * + fakewriter_tasks = kcalloc(nfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -1814,7 +1824,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("out of memory"); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d5cea81378cc..fb560fca9ef4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -386,7 +386,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { - pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(sp->sda); @@ -439,7 +439,7 @@ static void srcu_gp_start(struct srcu_struct *sp) struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - lockdep_assert_held(&sp->lock); + lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -492,8 +492,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, - &sdp->work, delay); + srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); } /* @@ -527,11 +526,11 @@ static void srcu_gp_end(struct srcu_struct *sp) { unsigned long cbdelay; bool cbs; + bool last_lvl; int cpu; unsigned long flags; unsigned long gpseq; int idx; - int idxnext; unsigned long mask; struct srcu_data *sdp; struct srcu_node *snp; @@ -555,11 +554,11 @@ static void srcu_gp_end(struct srcu_struct *sp) /* Initiate callback invocation as needed. */ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - if (snp >= sp->level[rcu_num_lvls - 1]) + last_lvl = snp >= sp->level[rcu_num_lvls - 1]; + if (last_lvl) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); @@ -572,13 +571,16 @@ static void srcu_gp_end(struct srcu_struct *sp) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); /* Occasionally prevent srcu_data counter wrap. */ - if (!(gpseq & counter_wrap_check)) + if (!(gpseq & counter_wrap_check) && last_lvl) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, + sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; spin_unlock_irqrestore_rcu_node(sdp, flags); } } @@ -593,9 +595,7 @@ static void srcu_gp_end(struct srcu_struct *sp) ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); spin_unlock_irq_rcu_node(sp); - /* Throttle expedited grace periods: Should be rare! */ - srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff - ? 0 : SRCU_INTERVAL); + srcu_reschedule(sp, 0); } else { spin_unlock_irq_rcu_node(sp); } @@ -626,7 +626,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, spin_unlock_irqrestore_rcu_node(snp, flags); } spin_lock_irqsave_rcu_node(sp, flags); - if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -691,8 +691,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); - queue_delayed_work(system_power_efficient_wq, &sp->work, - srcu_get_delay(sp)); + queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp)); } spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -1225,7 +1224,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) spin_unlock_irq_rcu_node(sp); if (pushgp) - queue_delayed_work(system_power_efficient_wq, &sp->work, delay); + queue_delayed_work(rcu_gp_wq, &sp->work, delay); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 491bdf39f276..2a734692a581 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1161,7 +1161,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) */ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) WRITE_ONCE(rdp->gpwrap, true); if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) @@ -1350,6 +1350,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_kthread ? rsp->gp_kthread->state : ~0, rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); if (rsp->gp_kthread) { + pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(rsp->gp_kthread); wake_up_process(rsp->gp_kthread); } @@ -1628,7 +1629,7 @@ void rcu_cpu_stall_reset(void) static unsigned long rcu_cbs_completed(struct rcu_state *rsp, struct rcu_node *rnp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* * If RCU is idle, we just wait for the next grace period. @@ -1675,7 +1676,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* * Pick up grace-period number for new callbacks. If this @@ -1803,7 +1804,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, { bool ret = false; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) @@ -1843,7 +1844,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) @@ -1871,7 +1872,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; bool need_gp; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && @@ -2296,7 +2297,7 @@ static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -2358,7 +2359,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - lockdep_assert_held(&rcu_get_root(rsp)->lock); + raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp)); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); @@ -2383,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, unsigned long oldmask = 0; struct rcu_node *rnp_c; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* Walk up the rcu_node hierarchy. */ for (;;) { @@ -2447,7 +2448,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2592,7 +2593,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; @@ -2691,7 +2692,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rdp->n_cbs_invoked += count; rcu_segcblist_insert_count(&rdp->cblist, &rcl); /* Reinstate batch limit if we have worked down the excess. */ @@ -2845,10 +2845,8 @@ static void force_quiescent_state(struct rcu_state *rsp) !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); - if (ret) { - rsp->n_force_qs_lh++; + if (ret) return; - } rnp_old = rnp; } /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ @@ -2857,7 +2855,6 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } @@ -3355,8 +3352,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) { struct rcu_node *rnp = rdp->mynode; - rdp->n_rcu_pending++; - /* Check for CPU stalls, if enabled. */ check_cpu_stall(rsp, rdp); @@ -3365,48 +3360,31 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rcu_scheduler_fully_active && - rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { - rdp->n_rp_core_needs_qs++; - } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { - rdp->n_rp_report_qs++; + if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) return 1; - } /* Does this CPU have callbacks ready to invoke? */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - rdp->n_rp_cb_ready++; + if (rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; - } /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) { - rdp->n_rp_cpu_needs_gp++; + if (cpu_needs_another_gp(rsp, rdp)) return 1; - } /* Has another RCU grace period completed? */ - if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ - rdp->n_rp_gp_completed++; + if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ return 1; - } /* Has a new RCU grace period started? */ if (READ_ONCE(rnp->gpnum) != rdp->gpnum || - unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ - rdp->n_rp_gp_started++; + unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; - } /* Does this CPU need a deferred NOCB wakeup? */ - if (rcu_nocb_need_deferred_wakeup(rdp)) { - rdp->n_rp_nocb_defer_wakeup++; + if (rcu_nocb_need_deferred_wakeup(rdp)) return 1; - } /* nothing to do */ - rdp->n_rp_need_nothing++; return 0; } @@ -3618,7 +3596,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; @@ -3636,12 +3614,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); @@ -3649,7 +3624,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4193,6 +4167,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) pr_cont("\n"); } +struct workqueue_struct *rcu_gp_wq; + void __init rcu_init(void) { int cpu; @@ -4219,6 +4195,10 @@ void __init rcu_init(void) rcu_cpu_starting(cpu); rcutree_online_cpu(cpu); } + + /* Create workqueue for expedited GPs and for Tree SRCU. */ + rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_gp_wq); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6488a3b0e729..f491ab4f2e8e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -146,12 +146,6 @@ struct rcu_node { /* boosting for this rcu_node structure. */ unsigned int boost_kthread_status; /* State of boost_kthread_task for tracing. */ - unsigned long n_tasks_boosted; - /* Total number of tasks boosted. */ - unsigned long n_exp_boosts; - /* Number of tasks boosted for expedited GP. */ - unsigned long n_normal_boosts; - /* Number of tasks boosted for normal GP. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -184,13 +178,6 @@ union rcu_noqs { u16 s; /* Set of bits, aggregate OR here. */ }; -/* Index values for nxttail array in struct rcu_data. */ -#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ -#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ -#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ -#define RCU_NEXT_TAIL 3 -#define RCU_NEXT_SIZE 4 - /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -217,8 +204,6 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ - unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ - unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -234,18 +219,7 @@ struct rcu_data { /* Grace period that needs help */ /* from cond_resched(). */ - /* 5) __rcu_pending() statistics. */ - unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ - unsigned long n_rp_core_needs_qs; - unsigned long n_rp_report_qs; - unsigned long n_rp_cb_ready; - unsigned long n_rp_cpu_needs_gp; - unsigned long n_rp_gp_completed; - unsigned long n_rp_gp_started; - unsigned long n_rp_nocb_defer_wakeup; - unsigned long n_rp_need_nothing; - - /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ + /* 5) _rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; @@ -256,7 +230,7 @@ struct rcu_data { atomic_long_t exp_workdone3; /* # done by others #3. */ int exp_dynticks_snap; /* Double-check need for IPI. */ - /* 7) Callback offloading. */ + /* 6) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; @@ -283,7 +257,7 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 8) RCU CPU stall data. */ + /* 7) RCU CPU stall data. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ @@ -374,10 +348,6 @@ struct rcu_state { /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ - unsigned long n_force_qs_lh; /* ~Number of calls leaving */ - /* due to lock unavailable. */ - unsigned long n_force_qs_ngp; /* Number of calls leaving */ - /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 46d61b597731..f72eefab8543 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -29,6 +29,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) } /* + * Return then value that expedited-grace-period counter will have + * at the end of the current grace period. + */ +static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) +{ + return rcu_seq_endval(&rsp->expedited_sequence); +} + +/* * Record the end of an expedited grace period. */ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) @@ -366,21 +375,30 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, int ret; struct rcu_node *rnp; + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); sync_exp_reset_tree(rsp); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; - for_each_leaf_node_possible_cpu(rnp, cpu) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + int snap; - rdp->exp_dynticks_snap = - rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || - rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || - !(rnp->qsmaskinitnext & rdp->grpmask)) - mask_ofl_test |= rdp->grpmask; + !(rnp->qsmaskinitnext & mask)) { + mask_ofl_test |= mask; + } else { + snap = rcu_dynticks_snap(rdtp); + if (rcu_dynticks_in_eqs(snap)) + mask_ofl_test |= mask; + else + rdp->exp_dynticks_snap = snap; + } } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; @@ -394,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_possible_cpu(rnp, cpu) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); @@ -417,6 +435,7 @@ retry_ipi: (rnp->expmask & mask)) { /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); schedule_timeout_uninterruptible(1); goto retry_ipi; } @@ -443,6 +462,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; @@ -606,7 +626,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rew.rew_rsp = rsp; rew.rew_s = s; INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - schedule_work(&rew.rew_work); + queue_work(rcu_gp_wq, &rew.rew_work); } /* Wait for expedited grace period to complete. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fb88a028deec..84fbee4686d3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,7 +180,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); struct task_struct *t = current; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); @@ -560,8 +560,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); sched_show_task(t); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -957,14 +963,10 @@ static int rcu_boost(struct rcu_node *rnp) * expedited grace period must boost all blocked tasks, including * those blocking the pre-existing normal grace period. */ - if (rnp->exp_tasks != NULL) { + if (rnp->exp_tasks != NULL) tb = rnp->exp_tasks; - rnp->n_exp_boosts++; - } else { + else tb = rnp->boost_tasks; - rnp->n_normal_boosts++; - } - rnp->n_tasks_boosted++; /* * We boost task t by manufacturing an rt_mutex that appears to @@ -1042,7 +1044,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -1677,6 +1679,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) char *ticks_title; unsigned long ticks_value; + /* + * We could be printing a lot while holding a spinlock. Avoid + * triggering hard lockup. + */ + touch_nmi_watchdog(); + if (rsp->gpnum == rdp->gpnum) { ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; @@ -2235,7 +2243,6 @@ static int rcu_nocb_kthread(void *arg) smp_mb__before_atomic(); /* _add after CB invocation. */ atomic_long_add(-c, &rdp->nocb_q_count); atomic_long_add(-cl, &rdp->nocb_q_count_lazy); - rdp->n_nocbs_invoked += c; } return 0; } @@ -2312,8 +2319,11 @@ void __init rcu_init_nohz(void) cpumask_and(rcu_nocb_mask, cpu_possible_mask, rcu_nocb_mask); } - pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", - cpumask_pr_args(rcu_nocb_mask)); + if (cpumask_empty(rcu_nocb_mask)) + pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); + else + pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", + cpumask_pr_args(rcu_nocb_mask)); if (rcu_nocb_poll) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6b5f19223d6..78eabc41eaa6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -113,16 +113,6 @@ config NO_HZ_FULL endchoice -config NO_HZ_FULL_ALL - bool "Full dynticks system on all CPUs by default (except CPU 0)" - depends on NO_HZ_FULL - help - If the user doesn't pass the nohz_full boot option to - define the range of full dynticks CPUs, consider that all - CPUs in the system are full dynticks by default. - Note the boot CPU will still be kept outside the range to - handle the timekeeping duty. - config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 29a5733eff83..ccd3782da0bf 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -405,30 +405,12 @@ static int tick_nohz_cpu_down(unsigned int cpu) return 0; } -static int tick_nohz_init_all(void) -{ - int err = -1; - -#ifdef CONFIG_NO_HZ_FULL_ALL - if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); - return err; - } - err = 0; - cpumask_setall(tick_nohz_full_mask); - tick_nohz_full_running = true; -#endif - return err; -} - void __init tick_nohz_init(void) { int cpu, ret; - if (!tick_nohz_full_running) { - if (tick_nohz_init_all() < 0) - return; - } + if (!tick_nohz_full_running) + return; /* * Full dynticks uses irq work to drive the tick rescheduling on safe diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 07a13779eece..65f6655026f0 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -136,6 +136,9 @@ identify_boot_image () { qemu-system-x86_64|qemu-system-i386) echo arch/x86/boot/bzImage ;; + qemu-system-aarch64) + echo arch/arm64/boot/Image + ;; *) echo vmlinux ;; @@ -158,6 +161,9 @@ identify_qemu () { elif echo $u | grep -q "Intel 80386" then echo qemu-system-i386 + elif echo $u | grep -q aarch64 + then + echo qemu-system-aarch64 elif uname -a | grep -q ppc64 then echo qemu-system-ppc64 @@ -176,16 +182,20 @@ identify_qemu () { # Output arguments for the qemu "-append" string based on CPU type # and the TORTURE_QEMU_INTERACTIVE environment variable. identify_qemu_append () { + local console=ttyS0 case "$1" in qemu-system-x86_64|qemu-system-i386) echo noapic selinux=0 initcall_debug debug ;; + qemu-system-aarch64) + console=ttyAMA0 + ;; esac if test -n "$TORTURE_QEMU_INTERACTIVE" then echo root=/dev/sda else - echo console=ttyS0 + echo console=$console fi } @@ -197,6 +207,9 @@ identify_qemu_args () { case "$1" in qemu-system-x86_64|qemu-system-i386) ;; + qemu-system-aarch64) + echo -machine virt,gic-version=host -cpu host + ;; qemu-system-ppc64) echo -enable-kvm -M pseries -nodefaults echo -device spapr-vscsi @@ -254,7 +267,7 @@ specify_qemu_cpus () { echo $2 else case "$1" in - qemu-system-x86_64|qemu-system-i386) + qemu-system-x86_64|qemu-system-i386|qemu-system-aarch64) echo $2 -smp $3 ;; qemu-system-ppc64) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh index 963f71289d22..8948f7926b21 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh @@ -39,30 +39,31 @@ sed -e 's/us : / : /' | tr -d '\015' | awk ' $8 == "start" { - if (starttask != "") + if (startseq != "") nlost++; starttask = $1; starttime = $3; startseq = $7; + seqtask[startseq] = starttask; } $8 == "end" { - if (starttask == $1 && startseq == $7) { + if (startseq == $7) { curgpdur = $3 - starttime; gptimes[++n] = curgpdur; gptaskcnt[starttask]++; sum += curgpdur; if (curgpdur > 1000) print "Long GP " starttime "us to " $3 "us (" curgpdur "us)"; - starttask = ""; + startseq = ""; } else { # Lost a message or some such, reset. - starttask = ""; + startseq = ""; nlost++; } } -$8 == "done" { +$8 == "done" && seqtask[$7] != $1 { piggybackcnt[$1]++; } diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 1b78a12740e5..5f8fbb0d7c17 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -177,8 +177,8 @@ then exit 0 fi echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log -echo $QEMU $qemu_args -m 512 -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd -( $QEMU $qemu_args -m 512 -kernel $KERNEL -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & +echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd +( $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 sleep 10 # Give qemu's pid a chance to reach the file if test -s "$resdir/qemu_pid" diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 7d1f607f0f76..56610dbbdf73 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -1,10 +1,8 @@ #!/bin/bash # -# Run a series of 14 tests under KVM. These are not particularly -# well-selected or well-tuned, but are the current set. -# -# Edit the definitions below to set the locations of the various directories, -# as well as the test duration. +# Run a series of tests under KVM. By default, this series is specified +# by the relevant CFLIST file, but can be overridden by the --configs +# command-line argument. # # Usage: kvm.sh [ options ] # @@ -44,6 +42,7 @@ TORTURE_BOOT_IMAGE="" TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD TORTURE_KCONFIG_ARG="" TORTURE_KMAKE_ARG="" +TORTURE_QEMU_MEM=512 TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu resdir="" @@ -70,6 +69,7 @@ usage () { echo " --kconfig Kconfig-options" echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" + echo " --memory megabytes | nnnG" echo " --no-initrd" echo " --qemu-args qemu-arguments" echo " --qemu-cmd qemu-system-..." @@ -147,6 +147,11 @@ do TORTURE_QEMU_MAC=$2 shift ;; + --memory) + checkarg --memory "(memory size)" $# "$2" '^[0-9]\+[MG]\?$' error + TORTURE_QEMU_MEM=$2 + shift + ;; --no-initrd) TORTURE_INITRD=""; export TORTURE_INITRD ;; @@ -174,6 +179,12 @@ do checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--' TORTURE_SUITE=$2 shift + if test "$TORTURE_SUITE" = rcuperf + then + # If you really want jitter for rcuperf, specify + # it after specifying rcuperf. (But why?) + jitter=0 + fi ;; *) echo Unknown argument $1 @@ -288,6 +299,7 @@ TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC +TORTURE_QEMU_MEM="$TORTURE_QEMU_MEM"; export TORTURE_QEMU_MEM TORTURE_SHUTDOWN_GRACE="$TORTURE_SHUTDOWN_GRACE"; export TORTURE_SHUTDOWN_GRACE TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE if ! test -e $resdir diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 index c70c51d5ded1..28568b72a31b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 @@ -9,5 +9,4 @@ CONFIG_PREEMPT=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y -CONFIG_NO_HZ_FULL_ALL=y #CHECK#CONFIG_RCU_EXPERT=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot index cd2a188eeb6d..838297c58318 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot @@ -1 +1 @@ -rcutorture.torture_type=tasks +rcutorture.torture_type=tasks nohz_full=1 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 27d22695d64c..24c9f6012e35 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -7,7 +7,6 @@ CONFIG_PREEMPT=n CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y -CONFIG_NO_HZ_FULL_ALL=y CONFIG_RCU_FAST_NO_HZ=y CONFIG_RCU_TRACE=y CONFIG_HOTPLUG_CPU=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot index e34c33430447..e6071bb96c7d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot @@ -1 +1 @@ -rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4 +rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4 nohz_full=1-7 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index 0f4759f4232e..d7afb271a586 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -7,7 +7,6 @@ CONFIG_PREEMPT=n CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y -CONFIG_NO_HZ_FULL_ALL=n CONFIG_RCU_FAST_NO_HZ=n CONFIG_RCU_TRACE=y CONFIG_HOTPLUG_CPU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh index b9603115d7c7..d36b8fd6f0fc 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh @@ -20,32 +20,10 @@ # # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> -# rcuperf_param_nreaders bootparam-string -# -# Adds nreaders rcuperf module parameter if not already specified. -rcuperf_param_nreaders () { - if ! echo "$1" | grep -q "rcuperf.nreaders" - then - echo rcuperf.nreaders=-1 - fi -} - -# rcuperf_param_nwriters bootparam-string -# -# Adds nwriters rcuperf module parameter if not already specified. -rcuperf_param_nwriters () { - if ! echo "$1" | grep -q "rcuperf.nwriters" - then - echo rcuperf.nwriters=-1 - fi -} - # per_version_boot_params bootparam-string config-file seconds # # Adds per-version torture-module parameters to kernels supporting them. per_version_boot_params () { - echo $1 `rcuperf_param_nreaders "$1"` \ - `rcuperf_param_nwriters "$1"` \ - rcuperf.shutdown=1 \ + echo $1 rcuperf.shutdown=1 \ rcuperf.verbose=1 } diff --git a/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt b/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt index 66efb59a1bd1..449cf579d6f9 100644 --- a/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt +++ b/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt @@ -1,4 +1,4 @@ -This document describes one way to created the rcu-test-image file +This document describes one way to create the rcu-test-image file that contains the filesystem used by the guest-OS kernel. There are probably much better ways of doing this, and this filesystem could no doubt be smaller. It is probably also possible to simply download |