From 8a6b173287bb94b3ef8360119020e856afb1c934 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Mar 2014 18:56:22 +0200 Subject: uprobes: Kill UPROBE_SKIP_SSTEP and can_skip_sstep() UPROBE_COPY_INSN, UPROBE_SKIP_SSTEP, and uprobe->flags must die. This patch kills UPROBE_SKIP_SSTEP. I never understood why it was added; not only it doesn't help, it harms. It can only help to avoid arch_uprobe_skip_sstep() if it was already called before and failed. But this is ugly, if we want to know whether we can emulate this instruction or not we should do this analysis in arch_uprobe_analyze_insn(), not when we hit this probe for the first time. And in fact this logic is simply wrong. arch_uprobe_skip_sstep() can fail or not depending on the task/register state, if this insn can be emulated but, say, put_user() fails we need to xol it this time, but this doesn't mean we shouldn't try to emulate it when this or another thread hits this bp next time. And this is the actual reason for this change. We need to emulate the "call" insn, but push(return-address) can obviously fail. Per-arch notes: x86: __skip_sstep() can only emulate "rep;nop". With this change it will be called every time and most probably for no reason. This will be fixed by the next changes. We need to change this suboptimal code anyway. arm: Should not be affected. It has its own "bool simulate" flag checked in arch_uprobe_skip_sstep(). ppc: Looks like, it can emulate almost everything. Does it actually need to record the fact that emulate_step() failed? Hopefully not. But if yes, it can add the ppc- specific flag into arch_uprobe. TODO: rename arch_uprobe_skip_sstep() to arch_uprobe_emulate_insn(), Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: David A. Long Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04709b66369d..ea2a7c0728bb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -60,8 +60,6 @@ static struct percpu_rw_semaphore dup_mmap_sem; /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ @@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); - /* a uprobe exists for this inode:offset combination */ if (cur_uprobe) { kfree(uprobe); @@ -1628,20 +1623,6 @@ bool uprobe_deny_signal(void) return true; } -/* - * Avoid singlestepping the original instruction if the original instruction - * is a NOP or can be emulated. 
- */ -static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) -{ - if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) - return true; - clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - } - return false; -} - static void mmf_recalc_uprobes(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -1868,13 +1849,13 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); - if (can_skip_sstep(uprobe, regs)) + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (!pre_ssout(uprobe, regs, bp_vaddr)) return; - /* can_skip_sstep() succeeded, or restart if can't singlestep */ + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } -- cgit v1.2.3 From 014940bad8e46ca7bd0483f760f9cba60088a3d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Apr 2014 20:20:10 +0200 Subject: uprobes/x86: Send SIGILL if arch_uprobe_post_xol() fails Currently the error from arch_uprobe_post_xol() is silently ignored. This doesn't look good and this can lead to the hard-to-debug problems. 1. Change handle_singlestep() to loudly complain and send SIGILL. Note: this only affects x86, ppc/arm can't fail. 2. Change arch_uprobe_post_xol() to call arch_uprobe_abort_xol() and avoid TF games if it is going to return an error. This can help to to analyze the problem, if nothing else we should not report ->ip = xol_slot in the core-file. Note: this means that handle_riprel_post_xol() can be called twice, but this is fine because it is idempotent. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 16 ++++++++++++---- kernel/events/uprobes.c | 8 +++++++- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 08cdb82815fe..e72903eacd43 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -594,6 +594,15 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + + if (auprobe->ops->post_xol) { + int err = auprobe->ops->post_xol(auprobe, regs); + if (err) { + arch_uprobe_abort_xol(auprobe, regs); + return err; + } + } + current->thread.trap_nr = utask->autask.saved_trap_nr; /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP @@ -605,8 +614,6 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; - if (auprobe->ops->post_xol) - return auprobe->ops->post_xol(auprobe, regs); return 0; } @@ -641,8 +648,9 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, /* * This function gets called when XOL instruction either gets trapped or - * the thread has a fatal signal, so reset the instruction pointer to its - * probed address. + * the thread has a fatal signal, or if arch_uprobe_post_xol() failed. + * Reset the instruction pointer to its probed address for the potential + * restart or for post mortem analysis. 
*/ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ea2a7c0728bb..d1edc5e6fd03 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1867,10 +1867,11 @@ out: static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; + int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) - arch_uprobe_post_xol(&uprobe->arch, regs); + err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else @@ -1884,6 +1885,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) spin_lock_irq(¤t->sighand->siglock); recalc_sigpending(); /* see uprobe_deny_signal() */ spin_unlock_irq(¤t->sighand->siglock); + + if (unlikely(err)) { + uprobe_warn(current, "execute the probed insn, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); + } } /* -- cgit v1.2.3 From febdbfe8a91ce0d11939d4940b592eb0dba8d663 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Feb 2014 18:16:07 +0100 Subject: arch: Prepare for smp_mb__{before,after}_atomic() Since the smp_mb__{before,after}*() ops are fundamentally dependent on how an arch can implement atomics it doesn't make sense to have 3 variants of them. They must all be the same. Furthermore, the 3 variants suggest they're only valid for those 3 atomic ops, while we have many more where they could be applied. So move away from smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() and reduce the interface to just the two: smp_mb__{before,after}_atomic(). This patch prepares the way by introducing default implementations in asm-generic/barrier.h that default to a full barrier and providing __deprecated inlines for the previous 6 barriers if they're not provided by the arch. This should allow for a mostly painless transition (lots of deprecated warns in the interim). Signed-off-by: Peter Zijlstra Acked-by: Paul E. 
McKenney Link: http://lkml.kernel.org/n/tip-wr59327qdyi9mbzn6x937s4e@git.kernel.org Cc: Arnd Bergmann Cc: "Chen, Gong" Cc: John Sullivan Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Srinivas Pandruvada Cc: "Theodore Ts'o" Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/asm-generic/atomic.h | 7 +------ include/asm-generic/barrier.h | 8 ++++++++ include/asm-generic/bitops.h | 9 +-------- include/linux/atomic.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/bitops.h | 20 ++++++++++++++++++++ kernel/sched/core.c | 16 ++++++++++++++++ 6 files changed, 82 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 33bd2de3bc1e..9c79e7603459 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -16,6 +16,7 @@ #define __ASM_GENERIC_ATOMIC_H #include +#include #ifdef CONFIG_SMP /* Force people to define core atomics */ @@ -182,11 +183,5 @@ static inline void atomic_set_mask(unsigned int mask, atomic_t *v) } #endif -/* Assume that atomic operations are already serializing */ -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() - #endif /* __KERNEL__ */ #endif /* __ASM_GENERIC_ATOMIC_H */ diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 6f692f8ac664..1402fa855388 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -62,6 +62,14 @@ #define set_mb(var, value) do { (var) = (value); mb(); } while (0) #endif +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic() smp_mb() +#endif + +#ifndef smp_mb__after_atomic +#define smp_mb__after_atomic() smp_mb() +#endif + #define smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h index 280ca7a96f75..dcdcacf2fd2b 100644 --- a/include/asm-generic/bitops.h +++ b/include/asm-generic/bitops.h @@ -11,14 +11,7 @@ #include #include - -/* - * clear_bit may not imply a memory barrier - */ -#ifndef smp_mb__before_clear_bit -#define smp_mb__before_clear_bit() smp_mb() -#define smp_mb__after_clear_bit() smp_mb() -#endif +#include #include #include diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 5b08a8540ecf..fef3a809e7cf 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -3,6 +3,42 @@ #define _LINUX_ATOMIC_H #include +/* + * Provide __deprecated wrappers for the new interface, avoid flag day changes. + * We need the ugly external functions to break header recursion hell. 
+ */ +#ifndef smp_mb__before_atomic_inc +static inline void __deprecated smp_mb__before_atomic_inc(void) +{ + extern void __smp_mb__before_atomic(void); + __smp_mb__before_atomic(); +} +#endif + +#ifndef smp_mb__after_atomic_inc +static inline void __deprecated smp_mb__after_atomic_inc(void) +{ + extern void __smp_mb__after_atomic(void); + __smp_mb__after_atomic(); +} +#endif + +#ifndef smp_mb__before_atomic_dec +static inline void __deprecated smp_mb__before_atomic_dec(void) +{ + extern void __smp_mb__before_atomic(void); + __smp_mb__before_atomic(); +} +#endif + +#ifndef smp_mb__after_atomic_dec +static inline void __deprecated smp_mb__after_atomic_dec(void) +{ + extern void __smp_mb__after_atomic(void); + __smp_mb__after_atomic(); +} +#endif + /** * atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t diff --git a/include/linux/bitops.h b/include/linux/bitops.h index be5fd38bd5a0..cbc5833fb221 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -32,6 +32,26 @@ extern unsigned long __sw_hweight64(__u64 w); */ #include +/* + * Provide __deprecated wrappers for the new interface, avoid flag day changes. + * We need the ugly external functions to break header recursion hell. + */ +#ifndef smp_mb__before_clear_bit +static inline void __deprecated smp_mb__before_clear_bit(void) +{ + extern void __smp_mb__before_atomic(void); + __smp_mb__before_atomic(); +} +#endif + +#ifndef smp_mb__after_clear_bit +static inline void __deprecated smp_mb__after_clear_bit(void) +{ + extern void __smp_mb__after_atomic(void); + __smp_mb__after_atomic(); +} +#endif + #define for_each_set_bit(bit, addr, size) \ for ((bit) = find_first_bit((addr), (size)); \ (bit) < (size); \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..8a70ec091760 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,6 +90,22 @@ #define CREATE_TRACE_POINTS #include +#ifdef smp_mb__before_atomic +void __smp_mb__before_atomic(void) +{ + smp_mb__before_atomic(); +} +EXPORT_SYMBOL(__smp_mb__before_atomic); +#endif + +#ifdef smp_mb__after_atomic +void __smp_mb__after_atomic(void) +{ + smp_mb__after_atomic(); +} +EXPORT_SYMBOL(__smp_mb__after_atomic); +#endif + void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; -- cgit v1.2.3 From cadefd3d6cc914d95163ba1eda766bfe7ce1e5b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Feb 2014 10:40:35 +0100 Subject: sched: Make scale_rt_power() deal with backward clocks Mike reported that, while unlikely, its entirely possible for scale_rt_power() to see the time go backwards. This yields rather 'interesting' results. So like all other sites that deal with clocks; make this one ignore backward clock movement too. 
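A note on why the clamp is needed (illustration only, not part of this patch): the old code computed the elapsed time in u64 arithmetic, so a clock sample that lags age_stamp wraps around instead of going negative. With clock = 1000 and age_stamp = 1500, the u64 difference is 2^64 - 500 (about 1.8e19), which inflates "total" and collapses the computed capacity; read as s64 it is just -500. A minimal kernel-style sketch of the pattern the hunk below applies:

static u64 safe_clock_delta(u64 clock, u64 age_stamp)
{
	s64 delta = clock - age_stamp;	/* negative if the clock lags the stamp */

	if (unlikely(delta < 0))	/* ignore backward clock movement */
		delta = 0;

	return delta;
}
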
Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140227094035.GZ9987@twins.programming.kicks-ass.net Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7570dd969c28..5e157f157d85 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5564,6 +5564,7 @@ static unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; + s64 delta; /* * Since we're reading these variables without serialization make sure @@ -5572,7 +5573,11 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq_clock(rq) - age_stamp); + delta = rq_clock(rq) - age_stamp; + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ -- cgit v1.2.3 From 10447917551e0fffb8d1892d46e633c3e0a9c1ec Mon Sep 17 00:00:00 2001 From: Kirill V Tkhai Date: Wed, 12 Mar 2014 06:18:33 -0400 Subject: sched/rt: Do not try to push tasks if pinned task switches to RT Just switched pinned task is not able to be pushed. If the rq had had several RT tasks before they have already been considered as candidates to be pushed (or pulled). Signed-off-by: Kirill V Tkhai Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140312061833.3a43aa64@gandalf.local.home Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index bd2267ad404f..1e4992eb5166 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1892,9 +1892,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->rt.overloaded && push_rt_task(rq) && + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ - rq != task_rq(p)) + push_rt_task(rq) && rq != task_rq(p)) check_resched = 0; #endif /* CONFIG_SMP */ if (check_resched && p->prio < rq->curr->prio) -- cgit v1.2.3 From 8698a745d800c59cd5a576398bdeccd578ac66f1 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Mar 2014 18:09:12 +0800 Subject: sched, treewide: Replace hardcoded nice values with MIN_NICE/MAX_NICE Replace various -20/+19 hardcoded nice values with MIN_NICE/MAX_NICE. Signed-off-by: Dongsheng Yang Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/ff13819fd09b7a5dba5ab5ae797f2e7019bdfa17.1394532288.git.yangds.fnst@cn.fujitsu.com Cc: devel@driverdev.osuosl.org Cc: devicetree@vger.kernel.org Cc: fcoe-devel@open-fcoe.org Cc: linux390@de.ibm.com Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: nbd-general@lists.sourceforge.net Cc: ocfs2-devel@oss.oracle.com Cc: openipmi-developer@lists.sourceforge.net Cc: qla2xxx-upstream@qlogic.com Cc: linux-arch@vger.kernel.org [ Consolidated the patches, twiddled the changelog. 
] Signed-off-by: Ingo Molnar --- drivers/block/loop.c | 2 +- drivers/block/nbd.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/char/ipmi/ipmi_si_intf.c | 2 +- drivers/s390/crypto/ap_bus.c | 2 +- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 4 ++-- drivers/scsi/bnx2i/bnx2i_hwi.c | 2 +- drivers/scsi/fcoe/fcoe.c | 2 +- drivers/scsi/ibmvscsi/ibmvfc.c | 2 +- drivers/scsi/ibmvscsi/ibmvscsi.c | 2 +- drivers/scsi/lpfc/lpfc_hbadisc.c | 2 +- drivers/scsi/qla2xxx/qla_os.c | 2 +- drivers/staging/android/binder.c | 2 +- drivers/staging/lustre/lustre/llite/lloop.c | 2 +- fs/ocfs2/cluster/heartbeat.c | 2 +- kernel/locking/locktorture.c | 2 +- kernel/workqueue.c | 6 +++--- mm/huge_memory.c | 2 +- 18 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 66e8c3b94ef3..c8bf270b7890 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -548,7 +548,7 @@ static int loop_thread(void *data) struct loop_device *lo = data; struct bio *bio; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 55298db36b2d..2a1f26bd6640 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -533,7 +533,7 @@ static int nbd_thread(void *data) struct nbd_device *nbd = data; struct request *req; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { /* wait for something to do */ wait_event_interruptible(nbd->waiting_wq, diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index a2af73db187b..ef166ad2dbad 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1463,7 +1463,7 @@ static int kcdrwd(void *foobar) struct packet_data *pkt; long min_sleep_time, residue; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_freezable(); for (;;) { diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index b7efd3c1a882..0f20d3646a46 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -998,7 +998,7 @@ static int ipmi_thread(void *data) struct timespec busy_until; ipmi_si_set_not_busy(&busy_until); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { int busy_wait; diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index ab3baa7f9508..8eec1653c9cc 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1803,7 +1803,7 @@ static int ap_poll_thread(void *data) int requests; struct ap_device *ap_dev; - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (1) { if (ap_suspend_flag) return 0; diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index 6287f6a8b79d..3455cc5e4bfd 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -464,7 +464,7 @@ static int bnx2fc_l2_rcv_thread(void *arg) struct fcoe_percpu_s *bg = arg; struct sk_buff *skb; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); @@ -602,7 +602,7 @@ int bnx2fc_percpu_io_thread(void *arg) struct bnx2fc_work *work, *tmp; LIST_HEAD(work_list); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); diff --git 
a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c index b5ffd280a1ae..d6d491c2f004 100644 --- a/drivers/scsi/bnx2i/bnx2i_hwi.c +++ b/drivers/scsi/bnx2i/bnx2i_hwi.c @@ -1870,7 +1870,7 @@ int bnx2i_percpu_io_thread(void *arg) struct bnx2i_work *work, *tmp; LIST_HEAD(work_list); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop()) { spin_lock_bh(&p->p_work_lock); diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index f3170008ae71..843a679d2a5e 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1872,7 +1872,7 @@ static int fcoe_percpu_receive_thread(void *arg) skb_queue_head_init(&tmp); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); retry: while (!kthread_should_stop()) { diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 23f5ba5e6472..8dd47689d584 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -4515,7 +4515,7 @@ static int ibmvfc_work(void *data) struct ibmvfc_host *vhost = data; int rc; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (1) { rc = wait_event_interruptible(vhost->work_wait_q, diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index fa764406df68..2ebfb2bb0f42 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -2213,7 +2213,7 @@ static int ibmvscsi_work(void *data) struct ibmvscsi_host_data *hostdata = data; int rc; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (1) { rc = wait_event_interruptible(hostdata->work_wait_q, diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 59b51c529ba0..294c072e9083 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -731,7 +731,7 @@ lpfc_do_work(void *p) struct lpfc_hba *phba = p; int rc; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); current->flags |= PF_NOFREEZE; phba->data_flags = 0; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 19e99cc33724..afc84814e9bb 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -4828,7 +4828,7 @@ qla2x00_do_dpc(void *data) ha = (struct qla_hw_data *)data; base_vha = pci_get_drvdata(ha->pdev); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index cfe4bc8f05cb..179b21b66504 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -441,7 +441,7 @@ static void binder_set_nice(long nice) "%d: nice value %ld not allowed use %ld instead\n", current->pid, nice, min_nice); set_user_nice(current, min_nice); - if (min_nice < 20) + if (min_nice <= MAX_NICE) return; binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c index f78eda235c7a..d4c2cd91add4 100644 --- a/drivers/staging/lustre/lustre/llite/lloop.c +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@ -407,7 +407,7 @@ static int loop_thread(void *data) int refcheck; int ret = 0; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); lo->lo_state = LLOOP_BOUND; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bf482dfed14f..73039295d0d1 100644 --- 
a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1107,7 +1107,7 @@ static int o2hb_thread(void *data) mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); /* Pin node */ o2nm_depend_this_node(); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f26b1a18e34e..23343be46e91 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg) static DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); do { schedule_timeout_uninterruptible(1); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0ee63af30bd1..c30c01b32ece 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -100,10 +100,10 @@ enum { /* * Rescue workers are used only on emergencies and shared by - * all cpus. Give -20. + * all cpus. Give MIN_NICE. */ - RESCUER_NICE_LEVEL = -20, - HIGHPRI_NICE_LEVEL = -20, + RESCUER_NICE_LEVEL = MIN_NICE, + HIGHPRI_NICE_LEVEL = MIN_NICE, WQ_NAME_LEN = 24, }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1546655a2d78..dcdb6f9adea1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2803,7 +2803,7 @@ static int khugepaged(void *none) struct mm_slot *mm_slot; set_freezable(); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(); -- cgit v1.2.3 From 22abdef37cebcdd4933c72339401a174b7d87768 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:14:49 +0400 Subject: sched/rt: Sum number of all children tasks in hierarhy at ->rt_nr_running {inc,dec}_rt_tasks() used to count entities which are directly queued on the rt_rq. If an entity was not a task (i.e., it is some queue), its children were not counted. There is no problem here, but now we want to count number of all tasks which are actually queued under the rt_rq in all the hierarchy (except throttled rt queues). Empty queues are not able to be queued and all of the places, which use ->rt_nr_running, just compare it with zero, so we do not break anything here. Signed-off-by: Kirill Tkhai Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835289.18748.31.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org [ Twiddled the changelog. 
] Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1e4992eb5166..6892ed7d644b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1044,13 +1044,24 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} #endif /* CONFIG_RT_GROUP_SCHED */ +static inline +unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + + if (group_rq) + return group_rq->rt_nr_running; + else + return 1; +} + static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { int prio = rt_se_prio(rt_se); WARN_ON(!rt_prio(prio)); - rt_rq->rt_nr_running++; + rt_rq->rt_nr_running += rt_se_nr_running(rt_se); inc_rt_prio(rt_rq, prio); inc_rt_migration(rt_se, rt_rq); @@ -1062,7 +1073,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); - rt_rq->rt_nr_running--; + rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); dec_rt_migration(rt_se, rt_rq); -- cgit v1.2.3 From 653d07a6989a9a4166dcd1025aa252b3605737fd Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:14:55 +0400 Subject: sched/rt: Add accessors rq_of_rt_se() Two accessors for RT_GROUP_SCHED and !RT_GROUP_SCHED cases. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835295.18748.32.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6892ed7d644b..f6aa3cdbee84 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -112,6 +112,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return rt_se->rt_rq; } +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = rt_se->rt_rq; + + return rt_rq->rq; +} + void free_rt_sched_group(struct task_group *tg) { int i; @@ -211,10 +218,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) return container_of(rt_rq, struct rq, rt); } -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct task_struct *p = rt_task_of(rt_se); - struct rq *rq = task_rq(p); + + return task_rq(p); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + struct rq *rq = rq_of_rt_se(rt_se); return &rq->rt; } -- cgit v1.2.3 From f4ebcbc0d7e009783256c9daf76bc4b90e645c14 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:15:00 +0400 Subject: sched/rt: Substract number of tasks of throttled queues from rq->nr_running Now rq->rt becomes to be able to be in dequeued or enqueued state. We add new member rt_rq->rt_queued, which is used to indicate this. The member is used only for top queue rq->rt_rq. The goal is to fit generic scheme which is used in deadline and fair classes, i.e. throttled rt_rq's rt_nr_running is beeing substracted from rq->nr_running. 
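Conceptual sketch (not part of the patch; the helper is hypothetical and the bookkeeping is approximate): ignoring the stop task, after this series the top-level counter should roughly satisfy the following, using the field names from kernel/sched/sched.h:

static inline unsigned int expected_nr_running(struct rq *rq)
{
	unsigned int nr = rq->cfs.h_nr_running + rq->dl.dl_nr_running;

	if (rq->rt.rt_queued)		/* a throttled rt_rq stays dequeued */
		nr += rq->rt.rt_nr_running;

	return nr;			/* ~= rq->nr_running */
}

This accounting is what lets the follow-up revert further below detect higher-class tasks with a plain rq->nr_running != rq->cfs.h_nr_running check in idle_balance().
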
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835300.18748.33.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 73 +++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 2 ++ 2 files changed, 63 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f6aa3cdbee84..2add019ddbd0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); #endif + /* We start is dequeued state, because no RT tasks are queued */ + rt_rq->rt_queued = 0; rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; @@ -404,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq) } #endif /* CONFIG_SMP */ +static void enqueue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq); + static inline int on_rt_rq(struct sched_rt_entity *rt_se) { return !list_empty(&rt_se->run_list); @@ -465,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; if (rt_rq->rt_nr_running) { - if (rt_se && !on_rt_rq(rt_se)) + if (!rt_se) + enqueue_top_rt_rq(rt_rq); + else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, false); + if (rt_rq->highest_prio.curr < curr->prio) resched_task(curr); } @@ -479,7 +487,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (rt_se && on_rt_rq(rt_se)) + if (!rt_se) + dequeue_top_rt_rq(rt_rq); + else if (on_rt_rq(rt_se)) dequeue_rt_entity(rt_se); } @@ -545,12 +555,18 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - if (rt_rq->rt_nr_running) - resched_task(rq_of_rt_rq(rt_rq)->curr); + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (!rt_rq->rt_nr_running) + return; + + enqueue_top_rt_rq(rt_rq); + resched_task(rq->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { + dequeue_top_rt_rq(rt_rq); } static inline const struct cpumask *sched_rt_period_mask(void) @@ -935,6 +951,38 @@ static void update_curr_rt(struct rq *rq) } } +static void +dequeue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (!rt_rq->rt_queued) + return; + + BUG_ON(!rq->nr_running); + + rq->nr_running -= rt_rq->rt_nr_running; + rt_rq->rt_queued = 0; +} + +static void +enqueue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (rt_rq->rt_queued) + return; + if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) + return; + + rq->nr_running += rt_rq->rt_nr_running; + rt_rq->rt_queued = 1; +} + #if defined CONFIG_SMP static void @@ -1143,6 +1191,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) back = rt_se; } + dequeue_top_rt_rq(rt_rq_of_se(back)); + for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) __dequeue_rt_entity(rt_se); @@ -1151,13 +1201,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { + struct rq *rq = rq_of_rt_se(rt_se); + dequeue_rt_stack(rt_se); for_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, head); + enqueue_top_rt_rq(&rq->rt); } static void dequeue_rt_entity(struct sched_rt_entity *rt_se) { + struct rq *rq = rq_of_rt_se(rt_se); + dequeue_rt_stack(rt_se); 
for_each_sched_rt_entity(rt_se) { @@ -1166,6 +1221,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) if (rt_rq && rt_rq->rt_nr_running) __enqueue_rt_entity(rt_se, false); } + enqueue_top_rt_rq(&rq->rt); } /* @@ -1183,8 +1239,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - inc_nr_running(rq); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1195,8 +1249,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); - - dec_nr_running(rq); } /* @@ -1401,10 +1453,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) + if (!rt_rq->rt_queued) return NULL; put_prev_task(rq, prev); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 456e492a3dca..c8d9ee418ca7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -409,6 +409,8 @@ struct rt_rq { int overloaded; struct plist_head pushable_tasks; #endif + int rt_queued; + int rt_throttled; u64 rt_time; u64 rt_runtime; -- cgit v1.2.3 From 46383648b3c769fa74794ae6425ab993fc113bdb Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:15:07 +0400 Subject: sched: Revert commit 4c6c4e38c4e9 ("sched/core: Fix endless loop in pick_next_task()") This reverts commit 4c6c4e38c4e9 ("sched/core: Fix endless loop in pick_next_task()"), which is not necessary after ("sched/rt: Substract number of tasks of throttled queues from rq->nr_running"). Signed-off-by: Kirill Tkhai Reviewed-by: Preeti U Murthy [conflict resolution with stop task checking patch] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835307.18748.34.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +---- kernel/sched/rt.c | 10 ++++++++++ kernel/sched/sched.h | 12 ------------ 3 files changed, 11 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e157f157d85..43232b8bacde 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6732,10 +6732,7 @@ static int idle_balance(struct rq *this_rq) out: /* Is there a task of a high priority class? 
*/ - if (this_rq->nr_running != this_rq->cfs.h_nr_running && - ((this_rq->stop && this_rq->stop->on_rq) || - this_rq->dl.dl_nr_running || - (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) + if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; if (pulled_task) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2add019ddbd0..7795e292f4c9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -493,6 +493,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} + static int rt_se_boosted(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -569,6 +574,11 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_top_rt_rq(rt_rq); } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} + static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_online_mask; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c8d9ee418ca7..b2cbe81308af 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -425,18 +425,6 @@ struct rt_rq { #endif }; -#ifdef CONFIG_RT_GROUP_SCHED -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} -#else -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} -#endif - /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ -- cgit v1.2.3 From c464c76eec4be587604ca082e8cded7e6b89f3bf Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 18 Mar 2014 16:56:41 +0800 Subject: perf: Allow building PMU drivers as modules This patch adds support for building PMU driver as module. It exports the functions perf_pmu_{register,unregister}() and adds reference tracking for the PMU driver module. When the PMU driver is built as a module, each active event of the PMU holds a reference to the driver module. 
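For illustration, a minimal skeleton (not part of this patch) of a PMU driver built as a module after this change. "example_pmu" and its no-op callbacks are hypothetical; the relevant pieces are .module = THIS_MODULE and the newly exported perf_pmu_register()/perf_pmu_unregister():

#include <linux/module.h>
#include <linux/perf_event.h>

static int example_event_init(struct perf_event *event)
{
	if (event->attr.type != event->pmu->type)
		return -ENOENT;		/* not our event type */
	return 0;
}

static int  example_add(struct perf_event *event, int flags)	{ return 0; }
static void example_del(struct perf_event *event, int flags)	{ }
static void example_start(struct perf_event *event, int flags)	{ }
static void example_stop(struct perf_event *event, int flags)	{ }
static void example_read(struct perf_event *event)		{ }

static struct pmu example_pmu = {
	.module		= THIS_MODULE,	/* each active event pins this module */
	.task_ctx_nr	= perf_invalid_context,
	.event_init	= example_event_init,
	.add		= example_add,
	.del		= example_del,
	.start		= example_start,
	.stop		= example_stop,
	.read		= example_read,
};

static int __init example_pmu_init(void)
{
	return perf_pmu_register(&example_pmu, "example", -1);
}

static void __exit example_pmu_exit(void)
{
	perf_pmu_unregister(&example_pmu);
}

module_init(example_pmu_init);
module_exit(example_pmu_exit);
MODULE_LICENSE("GPL");
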
Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1395133004-23205-1-git-send-email-zheng.z.yan@intel.com Cc: eranian@google.com Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/core.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3356abcfff18..af6dcf1d9e47 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -172,6 +172,7 @@ struct perf_event; struct pmu { struct list_head entry; + struct module *module; struct device *dev; const struct attribute_group **attr_groups; const char *name; diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..5129b1201050 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "internal.h" @@ -3229,6 +3230,9 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); + if (event->pmu) + module_put(event->pmu->module); + call_rcu(&event->rcu_head, free_event_rcu); } static void free_event(struct perf_event *event) @@ -6551,6 +6555,7 @@ free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; } +EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { @@ -6572,6 +6577,7 @@ void perf_pmu_unregister(struct pmu *pmu) put_device(pmu->dev); free_pmu_context(pmu); } +EXPORT_SYMBOL_GPL(perf_pmu_unregister); struct pmu *perf_init_event(struct perf_event *event) { @@ -6585,6 +6591,10 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { + if (!try_module_get(pmu->module)) { + pmu = ERR_PTR(-ENODEV); + goto unlock; + } event->pmu = pmu; ret = pmu->event_init(event); if (ret) @@ -6593,6 +6603,10 @@ struct pmu *perf_init_event(struct perf_event *event) } list_for_each_entry_rcu(pmu, &pmus, entry) { + if (!try_module_get(pmu->module)) { + pmu = ERR_PTR(-ENODEV); + goto unlock; + } event->pmu = pmu; ret = pmu->event_init(event); if (!ret) @@ -6771,6 +6785,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, err_pmu: if (event->destroy) event->destroy(event); + module_put(pmu->module); err_ns: if (event->ns) put_pid_ns(event->ns); -- cgit v1.2.3 From 8588a2bbddc524325d84d1d6996758e9242e4ffc Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 18 Mar 2014 16:56:42 +0800 Subject: hrtimer: Export __hrtimer_start_range_ns() Export __hrtimer_start_range_ns() to allow building perf Intel uncore driver as a module. 
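For context, a sketch (not part of this patch) of the kind of call site this export enables in a modular driver; the box structure and duration field are hypothetical, modelled loosely on the uncore use case:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

struct example_box {
	struct hrtimer	hrtimer;
	u64		hrtimer_duration;	/* polling interval in ns */
};

static void example_box_start_hrtimer(struct example_box *box)
{
	/* last argument is __hrtimer_start_range_ns()'s "wakeup" flag */
	__hrtimer_start_range_ns(&box->hrtimer,
				 ns_to_ktime(box->hrtimer_duration), 0,
				 HRTIMER_MODE_REL_PINNED, 0);
}
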
Signed-off-by: Yan, Zheng Acked-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1395133004-23205-2-git-send-email-zheng.z.yan@intel.com Cc: eranian@google.com Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d55092ceee29..53d26829cd4d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1017,6 +1017,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, return ret; } +EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU -- cgit v1.2.3 From 4e857c58efeb99393cba5a5d0d8ec7117183137c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Mar 2014 18:06:10 +0100 Subject: arch: Mass conversion of smp_mb__*() Mostly scripted conversion of the smp_mb__* barriers. Signed-off-by: Peter Zijlstra Acked-by: Paul E. McKenney Link: http://lkml.kernel.org/n/tip-55dhyhocezdw1dg7u19hmh1u@git.kernel.org Cc: Linus Torvalds Cc: linux-arch@vger.kernel.org Signed-off-by: Ingo Molnar --- block/blk-iopoll.c | 4 +- crypto/chainiv.c | 2 +- drivers/base/power/domain.c | 2 +- drivers/block/mtip32xx/mtip32xx.c | 4 +- drivers/cpuidle/coupled.c | 2 +- drivers/firewire/ohci.c | 2 +- drivers/gpu/drm/drm_irq.c | 10 ++-- drivers/gpu/drm/i915/i915_irq.c | 2 +- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/closure.h | 2 +- drivers/md/dm-bufio.c | 8 ++-- drivers/md/dm-snap.c | 4 +- drivers/md/dm.c | 2 +- drivers/md/raid5.c | 2 +- drivers/media/usb/dvb-usb-v2/dvb_usb_core.c | 6 +-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 6 +-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 18 ++++---- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c | 26 +++++------ drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 8 ++-- drivers/net/ethernet/broadcom/cnic.c | 8 ++-- drivers/net/ethernet/brocade/bna/bnad.c | 6 +-- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 2 +- drivers/net/ethernet/chelsio/cxgb3/sge.c | 6 +-- drivers/net/ethernet/chelsio/cxgb4/sge.c | 2 +- drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 2 +- drivers/net/ethernet/freescale/gianfar.c | 8 ++-- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 ++-- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 +-- drivers/net/wireless/ti/wlcore/main.c | 2 +- drivers/pci/xen-pcifront.c | 4 +- drivers/scsi/isci/remote_device.c | 2 +- drivers/target/loopback/tcm_loop.c | 4 +- drivers/target/target_core_alua.c | 26 +++++------ drivers/target/target_core_device.c | 6 +-- drivers/target/target_core_iblock.c | 2 +- drivers/target/target_core_pr.c | 56 +++++++++++------------ drivers/target/target_core_transport.c | 16 +++---- drivers/target/target_core_ua.c | 10 ++-- drivers/tty/n_tty.c | 2 +- drivers/tty/serial/mxs-auart.c | 4 +- drivers/usb/gadget/tcm_usb_gadget.c | 4 +- drivers/usb/serial/usb_wwan.c | 2 +- drivers/vhost/scsi.c | 2 +- drivers/w1/w1_family.c | 4 +- drivers/xen/xen-pciback/pciback_ops.c | 4 +- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 6 +-- fs/btrfs/ioctl.c | 2 +- fs/buffer.c | 2 +- fs/ext4/resize.c | 2 +- fs/gfs2/glock.c | 8 ++-- fs/gfs2/glops.c | 2 +- fs/gfs2/lock_dlm.c | 4 +- fs/gfs2/recovery.c | 2 +- fs/gfs2/sys.c | 4 +- fs/jbd2/commit.c | 6 +-- fs/nfs/dir.c | 12 ++--- fs/nfs/inode.c | 2 +- fs/nfs/nfs4filelayoutdev.c | 4 +- fs/nfs/nfs4state.c | 4 +- fs/nfs/pagelist.c | 6 +-- fs/nfs/pnfs.c | 2 +- 
fs/nfs/pnfs.h | 2 +- fs/nfs/write.c | 4 +- fs/ubifs/lpt_commit.c | 4 +- fs/ubifs/tnc_commit.c | 4 +- include/asm-generic/bitops/atomic.h | 2 +- include/asm-generic/bitops/lock.h | 2 +- include/linux/buffer_head.h | 2 +- include/linux/genhd.h | 2 +- include/linux/interrupt.h | 8 ++-- include/linux/netdevice.h | 2 +- include/linux/sched.h | 6 +-- include/linux/sunrpc/sched.h | 8 ++-- include/linux/sunrpc/xprt.h | 8 ++-- include/linux/tracehook.h | 2 +- include/net/ip_vs.h | 4 +- kernel/debug/debug_core.c | 4 +- kernel/futex.c | 4 +- kernel/kmod.c | 2 +- kernel/rcu/tree.c | 22 ++++----- kernel/rcu/tree_plugin.h | 8 ++-- kernel/sched/cpupri.c | 6 +-- kernel/sched/wait.c | 2 +- mm/backing-dev.c | 2 +- mm/filemap.c | 4 +- net/atm/pppoatm.c | 2 +- net/bluetooth/hci_event.c | 4 +- net/core/dev.c | 8 ++-- net/core/link_watch.c | 2 +- net/ipv4/inetpeer.c | 2 +- net/ipv4/tcp_output.c | 4 +- net/netfilter/nf_conntrack_core.c | 2 +- net/rds/ib_recv.c | 4 +- net/rds/iw_recv.c | 4 +- net/rds/send.c | 6 +-- net/rds/tcp_send.c | 2 +- net/sunrpc/auth.c | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/backchannel_rqst.c | 4 +- net/sunrpc/xprt.c | 4 +- net/sunrpc/xprtsock.c | 16 +++---- net/unix/af_unix.c | 2 +- sound/pci/bt87x.c | 4 +- 106 files changed, 284 insertions(+), 288 deletions(-) (limited to 'kernel') diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index c11d24e379e2..f8c6a11b13f0 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -49,7 +49,7 @@ EXPORT_SYMBOL(blk_iopoll_sched); void __blk_iopoll_complete(struct blk_iopoll *iop) { list_del(&iop->list); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); } EXPORT_SYMBOL(__blk_iopoll_complete); @@ -161,7 +161,7 @@ EXPORT_SYMBOL(blk_iopoll_disable); void blk_iopoll_enable(struct blk_iopoll *iop) { BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); } EXPORT_SYMBOL(blk_iopoll_enable); diff --git a/crypto/chainiv.c b/crypto/chainiv.c index 834d8dd3d4fc..9c294c8f9a07 100644 --- a/crypto/chainiv.c +++ b/crypto/chainiv.c @@ -126,7 +126,7 @@ static int async_chainiv_schedule_work(struct async_chainiv_ctx *ctx) int err = ctx->err; if (!ctx->queue.qlen) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(CHAINIV_STATE_INUSE, &ctx->state); if (!ctx->queue.qlen || diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index ae098a261fcd..eee55c1e5fde 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -105,7 +105,7 @@ static bool genpd_sd_counter_dec(struct generic_pm_domain *genpd) static void genpd_sd_counter_inc(struct generic_pm_domain *genpd) { atomic_inc(&genpd->sd_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } static void genpd_acquire_lock(struct generic_pm_domain *genpd) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 59c5abe32f06..4fd8d6c1c3d2 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -224,9 +224,9 @@ static int get_slot(struct mtip_port *port) */ static inline void release_slot(struct mtip_port *port, int tag) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(tag, port->allocated); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } /* diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index cb6654bfad77..73fe2f8d7f96 100644 --- a/drivers/cpuidle/coupled.c +++ 
b/drivers/cpuidle/coupled.c @@ -159,7 +159,7 @@ void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a) { int n = dev->coupled->online_count; - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(a); while (atomic_read(a) < n) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 8db663219560..995dd42a2627 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -3498,7 +3498,7 @@ static int ohci_flush_iso_completions(struct fw_iso_context *base) } clear_bit_unlock(0, &ctx->flushing_completions); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } tasklet_enable(&ctx->context.tasklet); diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c index c2676b5908d9..ec5c3f4cdd01 100644 --- a/drivers/gpu/drm/drm_irq.c +++ b/drivers/gpu/drm/drm_irq.c @@ -156,7 +156,7 @@ static void vblank_disable_and_save(struct drm_device *dev, int crtc) */ if ((vblrc > 0) && (abs64(diff_ns) > 1000000)) { atomic_inc(&dev->vblank[crtc].count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } /* Invalidate all timestamps while vblank irq's are off. */ @@ -864,9 +864,9 @@ static void drm_update_vblank_count(struct drm_device *dev, int crtc) vblanktimestamp(dev, crtc, tslot) = t_vblank; } - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_add(diff, &dev->vblank[crtc].count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } /** @@ -1330,9 +1330,9 @@ bool drm_handle_vblank(struct drm_device *dev, int crtc) /* Increment cooked vblank count. This also atomically commits * the timestamp computed above. */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&dev->vblank[crtc].count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } else { DRM_DEBUG("crtc %d: Redundant vblirq ignored. diff_ns = %d\n", crtc, (int) diff_ns); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 7753249b3a95..5409bfafff63 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2147,7 +2147,7 @@ static void i915_error_work_func(struct work_struct *work) * updates before * the counter increment. 
*/ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&dev_priv->gpu_error.reset_counter); kobject_uevent_env(&dev->primary->kdev->kobj, diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 82c9c5d35251..d2ebcf323094 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -828,7 +828,7 @@ static inline bool cached_dev_get(struct cached_dev *dc) return false; /* Paired with the mb in cached_dev_attach */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return true; } diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 7ef7461912be..a08e3eeac3c5 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -243,7 +243,7 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, cl->fn = fn; cl->wq = wq; /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); } static inline void closure_queue(struct closure *cl) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 66c5d130c8c2..4e84095833db 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -607,9 +607,9 @@ static void write_endio(struct bio *bio, int error) BUG_ON(!test_bit(B_WRITING, &b->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(B_WRITING, &b->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&b->state, B_WRITING); } @@ -997,9 +997,9 @@ static void read_endio(struct bio *bio, int error) BUG_ON(!test_bit(B_READING, &b->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(B_READING, &b->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&b->state, B_READING); } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index ebddef5237e4..8e0caed0bf74 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -642,7 +642,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) struct dm_snapshot *s = pe->snap; mempool_free(pe, s->pending_pool); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&s->pending_exceptions_count); } @@ -783,7 +783,7 @@ static int init_hash_tables(struct dm_snapshot *s) static void merge_shutdown(struct dm_snapshot *s) { clear_bit_unlock(RUNNING_MERGE, &s->state_bits); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&s->state_bits, RUNNING_MERGE); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 455e64916498..2db768e4553f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2447,7 +2447,7 @@ static void dm_wq_work(struct work_struct *work) static void dm_queue_flush(struct mapped_device *md) { clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); queue_work(md->wq, &md->work); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ad1b9bea446e..2afef4ec9312 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4400,7 +4400,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) * STRIPE_ON_UNPLUG_LIST clear but the stripe * is still in our list */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); /* * STRIPE_ON_RELEASE_LIST could be set here. 
In that diff --git a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c index de02db802ace..e35580618936 100644 --- a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c +++ b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c @@ -399,7 +399,7 @@ static int dvb_usb_stop_feed(struct dvb_demux_feed *dvbdmxfeed) /* clear 'streaming' status bit */ clear_bit(ADAP_STREAMING, &adap->state_bits); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&adap->state_bits, ADAP_STREAMING); skip_feed_stop: @@ -550,7 +550,7 @@ static int dvb_usb_fe_init(struct dvb_frontend *fe) err: if (!adap->suspend_resume_active) { clear_bit(ADAP_INIT, &adap->state_bits); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&adap->state_bits, ADAP_INIT); } @@ -591,7 +591,7 @@ err: if (!adap->suspend_resume_active) { adap->active_fe = -1; clear_bit(ADAP_SLEEP, &adap->state_bits); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&adap->state_bits, ADAP_SLEEP); } diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 9261d5313b5b..dd57c7c5a3da 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -2781,7 +2781,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode) case LOAD_OPEN: netif_tx_start_all_queues(bp->dev); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); break; case LOAD_DIAG: @@ -4939,9 +4939,9 @@ void bnx2x_update_coalesce_sb_index(struct bnx2x *bp, u8 fw_sb_id, void bnx2x_schedule_sp_rtnl(struct bnx2x *bp, enum sp_rtnl_flag flag, u32 verbose) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(flag, &bp->sp_rtnl_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); DP((BNX2X_MSG_SP | verbose), "Scheduling sp_rtnl task [Flag: %d]\n", flag); schedule_delayed_work(&bp->sp_rtnl_task, 0); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index a78edaccceee..16391db2e8c9 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -1858,10 +1858,10 @@ void bnx2x_sp_event(struct bnx2x_fastpath *fp, union eth_rx_cqe *rr_cqe) return; #endif - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&bp->cq_spq_left); /* push the change in bp->spq_left and towards the memory */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); DP(BNX2X_MSG_SP, "bp->cq_spq_left %x\n", atomic_read(&bp->cq_spq_left)); @@ -1876,11 +1876,11 @@ void bnx2x_sp_event(struct bnx2x_fastpath *fp, union eth_rx_cqe *rr_cqe) * sp_state is cleared, and this order prevents * races */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(BNX2X_AFEX_PENDING_VIFSET_MCP_ACK, &bp->sp_state); wmb(); clear_bit(BNX2X_AFEX_FCOE_Q_UPDATE_PENDING, &bp->sp_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* schedule the sp task as mcp ack is required */ bnx2x_schedule_sp_task(bp); @@ -5272,9 +5272,9 @@ static void bnx2x_after_function_update(struct bnx2x *bp) __clear_bit(RAMROD_COMP_WAIT, &queue_params.ramrod_flags); /* mark latest Q bit */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(BNX2X_AFEX_FCOE_Q_UPDATE_PENDING, &bp->sp_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* send Q update ramrod for FCoE Q */ rc = bnx2x_queue_state_change(bp, &queue_params); @@ -5500,7 +5500,7 @@ next_spqe: spqe_cnt++; } /* for */ - smp_mb__before_atomic_inc(); + 
smp_mb__before_atomic(); atomic_add(spqe_cnt, &bp->eq_spq_left); bp->eq_cons = sw_cons; @@ -13869,9 +13869,9 @@ static int bnx2x_drv_ctl(struct net_device *dev, struct drv_ctl_info *ctl) case DRV_CTL_RET_L2_SPQ_CREDIT_CMD: { int count = ctl->data.credit.credit_count; - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_add(count, &bp->cq_spq_left); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); break; } case DRV_CTL_ULP_REGISTER_CMD: { diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c index 31297266b743..d725317c4277 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c @@ -258,16 +258,16 @@ static bool bnx2x_raw_check_pending(struct bnx2x_raw_obj *o) static void bnx2x_raw_clear_pending(struct bnx2x_raw_obj *o) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(o->state, o->pstate); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static void bnx2x_raw_set_pending(struct bnx2x_raw_obj *o) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(o->state, o->pstate); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } /** @@ -2131,7 +2131,7 @@ static int bnx2x_set_rx_mode_e1x(struct bnx2x *bp, /* The operation is completed */ clear_bit(p->state, p->pstate); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return 0; } @@ -3576,16 +3576,16 @@ error_exit1: static void bnx2x_mcast_clear_sched(struct bnx2x_mcast_obj *o) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(o->sched_state, o->raw.pstate); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static void bnx2x_mcast_set_sched(struct bnx2x_mcast_obj *o) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(o->sched_state, o->raw.pstate); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static bool bnx2x_mcast_check_sched(struct bnx2x_mcast_obj *o) @@ -4200,7 +4200,7 @@ int bnx2x_queue_state_change(struct bnx2x *bp, if (rc) { o->next_state = BNX2X_Q_STATE_MAX; clear_bit(pending_bit, pending); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return rc; } @@ -4288,7 +4288,7 @@ static int bnx2x_queue_comp_cmd(struct bnx2x *bp, wmb(); clear_bit(cmd, &o->pending); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return 0; } @@ -5279,7 +5279,7 @@ static inline int bnx2x_func_state_change_comp(struct bnx2x *bp, wmb(); clear_bit(cmd, &o->pending); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return 0; } @@ -5926,7 +5926,7 @@ int bnx2x_func_state_change(struct bnx2x *bp, if (rc) { o->next_state = BNX2X_F_STATE_MAX; clear_bit(cmd, pending); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return rc; } diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 5c523b32db70..f82ac5ac2336 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -1626,9 +1626,9 @@ static void bnx2x_vf_handle_filters_eqe(struct bnx2x *bp, struct bnx2x_virtf *vf) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BNX2X_FILTER_RX_MODE_PENDING, &vf->filter_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static void bnx2x_vf_handle_rss_update_eqe(struct bnx2x *bp, @@ -2960,9 +2960,9 @@ void bnx2x_iov_task(struct work_struct *work) void bnx2x_schedule_iov_task(struct bnx2x *bp, enum bnx2x_iov_flag flag) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); 
set_bit(flag, &bp->iov_task_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); DP(BNX2X_MSG_IOV, "Scheduling iov task [Flag: %d]\n", flag); queue_delayed_work(bnx2x_iov_wq, &bp->iov_task, 0); } diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index 09f3fefcbf9c..4dd48d2fa804 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -436,7 +436,7 @@ static int cnic_offld_prep(struct cnic_sock *csk) static int cnic_close_prep(struct cnic_sock *csk) { clear_bit(SK_F_CONNECT_START, &csk->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (test_and_clear_bit(SK_F_OFFLD_COMPLETE, &csk->flags)) { while (test_and_set_bit(SK_F_OFFLD_SCHED, &csk->flags)) @@ -450,7 +450,7 @@ static int cnic_close_prep(struct cnic_sock *csk) static int cnic_abort_prep(struct cnic_sock *csk) { clear_bit(SK_F_CONNECT_START, &csk->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); while (test_and_set_bit(SK_F_OFFLD_SCHED, &csk->flags)) msleep(1); @@ -3646,7 +3646,7 @@ static int cnic_cm_destroy(struct cnic_sock *csk) csk_hold(csk); clear_bit(SK_F_INUSE, &csk->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); while (atomic_read(&csk->ref_count) != 1) msleep(1); cnic_cm_cleanup(csk); @@ -4026,7 +4026,7 @@ static void cnic_cm_process_kcqe(struct cnic_dev *dev, struct kcqe *kcqe) L4_KCQE_COMPLETION_STATUS_PARITY_ERROR) set_bit(SK_F_HW_ERR, &csk->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(SK_F_OFFLD_SCHED, &csk->flags); cnic_cm_upcall(cp, csk, opcode); break; diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index 675550fe8ee9..3a77f9ead004 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -249,7 +249,7 @@ bnad_tx_complete(struct bnad *bnad, struct bna_tcb *tcb) if (likely(test_bit(BNAD_TXQ_TX_STARTED, &tcb->flags))) bna_ib_ack(tcb->i_dbell, sent); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BNAD_TXQ_FREE_SENT, &tcb->flags); return sent; @@ -1126,7 +1126,7 @@ bnad_tx_cleanup(struct delayed_work *work) bnad_txq_cleanup(bnad, tcb); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BNAD_TXQ_FREE_SENT, &tcb->flags); } @@ -2992,7 +2992,7 @@ bnad_start_xmit(struct sk_buff *skb, struct net_device *netdev) sent = bnad_txcmpl_process(bnad, tcb); if (likely(test_bit(BNAD_TXQ_TX_STARTED, &tcb->flags))) bna_ib_ack(tcb->i_dbell, sent); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BNAD_TXQ_FREE_SENT, &tcb->flags); } else { netif_stop_queue(netdev); diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 0fe7ff750d77..05613a85ce61 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -281,7 +281,7 @@ static int cxgb_close(struct net_device *dev) if (adapter->params.stats_update_period && !(adapter->open_device_map & PORT_MASK)) { /* Stop statistics accumulation. 
*/ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_lock(&adapter->work_lock); /* sync with update task */ spin_unlock(&adapter->work_lock); cancel_mac_stats_update(adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c index 8b069f96e920..3dfcf600fcc6 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -1379,7 +1379,7 @@ static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q, struct sge_qset *qs = txq_to_qset(q, qid); set_bit(qid, &qs->txq_stopped); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (should_restart_tx(q) && test_and_clear_bit(qid, &qs->txq_stopped)) @@ -1492,7 +1492,7 @@ static void restart_ctrlq(unsigned long data) if (!skb_queue_empty(&q->sendq)) { set_bit(TXQ_CTRL, &qs->txq_stopped); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (should_restart_tx(q) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) @@ -1697,7 +1697,7 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK); if (unlikely(q->size - q->in_use < ndesc)) { set_bit(TXQ_OFLD, &qs->txq_stopped); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (should_restart_tx(q) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index ca95cf2954eb..e249528c8e60 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -2031,7 +2031,7 @@ static void sge_rx_timer_cb(unsigned long data) struct sge_fl *fl = s->egr_map[id]; clear_bit(id, s->starving_fl); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (fl_starving(fl)) { rxq = container_of(fl, struct sge_eth_rxq, fl); diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c index 9cfa4b4bb089..9d88c1d50b49 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c @@ -1951,7 +1951,7 @@ static void sge_rx_timer_cb(unsigned long data) struct sge_fl *fl = s->egr_map[id]; clear_bit(id, s->starving_fl); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* * Since we are accessing fl without a lock there's a diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 9125d9abf099..d82f092cae90 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1797,9 +1797,9 @@ void stop_gfar(struct net_device *dev) netif_tx_stop_all_queues(dev); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(GFAR_DOWN, &priv->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); disable_napi(priv); @@ -2042,9 +2042,9 @@ int startup_gfar(struct net_device *ndev) gfar_init_tx_rx_base(priv); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(GFAR_DOWN, &priv->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* Start Rx/Tx DMA and enable the interrupts */ gfar_start(priv); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 861b722c2672..1e526c072a44 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -4671,7 +4671,7 @@ static void i40e_service_event_complete(struct i40e_pf *pf) BUG_ON(!test_bit(__I40E_SERVICE_SCHED, &pf->state)); /* flush memory to make sure state is correct before next watchog */ - smp_mb__before_clear_bit(); + 
smp_mb__before_atomic(); clear_bit(__I40E_SERVICE_SCHED, &pf->state); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index c4c526b7f99f..2fecc2626de5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -376,7 +376,7 @@ static void ixgbe_service_event_complete(struct ixgbe_adapter *adapter) BUG_ON(!test_bit(__IXGBE_SERVICE_SCHED, &adapter->state)); /* flush memory to make sure state is correct before next watchdog */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBE_SERVICE_SCHED, &adapter->state); } @@ -4671,7 +4671,7 @@ static void ixgbe_up_complete(struct ixgbe_adapter *adapter) if (hw->mac.ops.enable_tx_laser) hw->mac.ops.enable_tx_laser(hw); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBE_DOWN, &adapter->state); ixgbe_napi_enable_all(adapter); @@ -5567,7 +5567,7 @@ static int ixgbe_resume(struct pci_dev *pdev) e_dev_err("Cannot enable PCI device from suspend\n"); return err; } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBE_DISABLED, &adapter->state); pci_set_master(pdev); @@ -8541,7 +8541,7 @@ static pci_ers_result_t ixgbe_io_slot_reset(struct pci_dev *pdev) e_err(probe, "Cannot re-enable PCI device after reset.\n"); result = PCI_ERS_RESULT_DISCONNECT; } else { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBE_DISABLED, &adapter->state); adapter->hw.hw_addr = adapter->io_addr; pci_set_master(pdev); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index d0799e8e31e4..de2793b06305 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1668,7 +1668,7 @@ static void ixgbevf_up_complete(struct ixgbevf_adapter *adapter) spin_unlock_bh(&adapter->mbx_lock); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBEVF_DOWN, &adapter->state); ixgbevf_napi_enable_all(adapter); @@ -3354,7 +3354,7 @@ static int ixgbevf_resume(struct pci_dev *pdev) dev_err(&pdev->dev, "Cannot enable PCI device from suspend\n"); return err; } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBEVF_DISABLED, &adapter->state); pci_set_master(pdev); @@ -3712,7 +3712,7 @@ static pci_ers_result_t ixgbevf_io_slot_reset(struct pci_dev *pdev) return PCI_ERS_RESULT_DISCONNECT; } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBEVF_DISABLED, &adapter->state); pci_set_master(pdev); diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index ed88d3913483..e71eae353368 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -543,7 +543,7 @@ static int wlcore_irq_locked(struct wl1271 *wl) * wl1271_ps_elp_wakeup cannot be called concurrently. 
*/ clear_bit(WL1271_FLAG_IRQ_RUNNING, &wl->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); ret = wlcore_fw_status(wl, wl->fw_status); if (ret < 0) diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 179b8edc2262..53df39a22c8a 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -662,9 +662,9 @@ static void pcifront_do_aer(struct work_struct *data) notify_remote_via_evtchn(pdev->evtchn); /*in case of we lost an aer request in four lines time_window*/ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(_PDEVB_op_active, &pdev->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); schedule_pcifront_aer_op(pdev); diff --git a/drivers/scsi/isci/remote_device.c b/drivers/scsi/isci/remote_device.c index 96a26f454673..cc51f38b116d 100644 --- a/drivers/scsi/isci/remote_device.c +++ b/drivers/scsi/isci/remote_device.c @@ -1541,7 +1541,7 @@ void isci_remote_device_release(struct kref *kref) clear_bit(IDEV_STOP_PENDING, &idev->flags); clear_bit(IDEV_IO_READY, &idev->flags); clear_bit(IDEV_GONE, &idev->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(IDEV_ALLOCATED, &idev->flags); wake_up(&ihost->eventq); } diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index c886ad1c39fb..73ab75ddaf42 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -951,7 +951,7 @@ static int tcm_loop_port_link( struct tcm_loop_hba *tl_hba = tl_tpg->tl_hba; atomic_inc(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * Add Linux/SCSI struct scsi_device by HCTL */ @@ -986,7 +986,7 @@ static void tcm_loop_port_unlink( scsi_device_put(sd); atomic_dec(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); pr_debug("TCM_Loop_ConfigFS: Port Unlink Successful\n"); } diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index fcbe6125b73e..0b79b852f4b2 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -393,7 +393,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) continue; atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -404,7 +404,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); break; } spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -990,7 +990,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) * TARGET PORT GROUPS command */ atomic_inc(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&tg_pt_gp->tg_pt_gp_lock); spin_lock_bh(&port->sep_alua_lock); @@ -1020,7 +1020,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) spin_lock(&tg_pt_gp->tg_pt_gp_lock); atomic_dec(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&tg_pt_gp->tg_pt_gp_lock); /* @@ -1054,7 +1054,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) core_alua_dump_state(tg_pt_gp->tg_pt_gp_alua_pending_state)); spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if 
(tg_pt_gp->tg_pt_gp_transition_complete) @@ -1116,7 +1116,7 @@ static int core_alua_do_transition_tg_pt( */ spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if (!explicit && tg_pt_gp->tg_pt_gp_implicit_trans_secs) { @@ -1159,7 +1159,7 @@ int core_alua_do_port_transition( spin_lock(&local_lu_gp_mem->lu_gp_mem_lock); lu_gp = local_lu_gp_mem->lu_gp; atomic_inc(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&local_lu_gp_mem->lu_gp_mem_lock); /* * For storage objects that are members of the 'default_lu_gp', @@ -1176,7 +1176,7 @@ int core_alua_do_port_transition( rc = core_alua_do_transition_tg_pt(l_tg_pt_gp, new_state, explicit); atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); return rc; } /* @@ -1190,7 +1190,7 @@ int core_alua_do_port_transition( dev = lu_gp_mem->lu_gp_mem_dev; atomic_inc(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&lu_gp->lu_gp_lock); spin_lock(&dev->t10_alua.tg_pt_gps_lock); @@ -1219,7 +1219,7 @@ int core_alua_do_port_transition( tg_pt_gp->tg_pt_gp_alua_nacl = NULL; } atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); /* * core_alua_do_transition_tg_pt() will always return @@ -1230,7 +1230,7 @@ int core_alua_do_port_transition( spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); if (rc) break; } @@ -1238,7 +1238,7 @@ int core_alua_do_port_transition( spin_lock(&lu_gp->lu_gp_lock); atomic_dec(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&lu_gp->lu_gp_lock); @@ -1252,7 +1252,7 @@ int core_alua_do_port_transition( } atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); return rc; } diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 65001e133670..72618776ede4 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -225,7 +225,7 @@ struct se_dev_entry *core_get_se_deve_from_rtpi( continue; atomic_inc(&deve->pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock_irq(&nacl->device_list_lock); return deve; @@ -1392,7 +1392,7 @@ int core_dev_add_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_add_tail(&lacl->lacl_list, &lun->lun_acl_list); atomic_inc(&lun->lun_acl_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&lun->lun_acl_lock); pr_debug("%s_TPG[%hu]_LUN[%u->%u] - Added %s ACL for " @@ -1426,7 +1426,7 @@ int core_dev_del_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_del(&lacl->lacl_list); atomic_dec(&lun->lun_acl_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); spin_unlock(&lun->lun_acl_lock); core_disable_device_list_for_node(lun, NULL, lacl->mapped_lun, diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 9e0232cca92e..7e6b857c6b3f 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -323,7 +323,7 @@ static void iblock_bio_done(struct bio *bio, int err) * Bump the ib_bio_err_cnt and release bio. 
*/ atomic_inc(&ibr->ib_bio_err_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } bio_put(bio); diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 3013287a2aaa..df357862286e 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -675,7 +675,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( spin_lock(&dev->se_port_lock); list_for_each_entry_safe(port, port_tmp, &dev->dev_sep_list, sep_list) { atomic_inc(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->se_port_lock); spin_lock_bh(&port->sep_alua_lock); @@ -710,7 +710,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( continue; atomic_inc(&deve_tmp->pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock_bh(&port->sep_alua_lock); /* * Grab a configfs group dependency that is released @@ -723,9 +723,9 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( pr_err("core_scsi3_lunacl_depend" "_item() failed\n"); atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); goto out; } /* @@ -740,9 +740,9 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( sa_res_key, all_tg_pt, aptpl); if (!pr_reg_atp) { atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); core_scsi3_lunacl_undepend_item(deve_tmp); goto out; } @@ -755,7 +755,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( spin_lock(&dev->se_port_lock); atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&dev->se_port_lock); @@ -1110,7 +1110,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( continue; } atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1125,7 +1125,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( continue; atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1155,7 +1155,7 @@ static struct t10_pr_registration *core_scsi3_locate_pr_reg( static void core_scsi3_put_pr_reg(struct t10_pr_registration *pr_reg) { atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static int core_scsi3_check_implicit_release( @@ -1349,7 +1349,7 @@ static void core_scsi3_tpg_undepend_item(struct se_portal_group *tpg) &tpg->tpg_group.cg_item); atomic_dec(&tpg->tpg_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static int core_scsi3_nodeacl_depend_item(struct se_node_acl *nacl) @@ -1369,7 +1369,7 @@ static void core_scsi3_nodeacl_undepend_item(struct se_node_acl *nacl) if (nacl->dynamic_node_acl) { atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); return; } @@ -1377,7 +1377,7 @@ static void core_scsi3_nodeacl_undepend_item(struct se_node_acl *nacl) &nacl->acl_group.cg_item); atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve) @@ -1408,7 +1408,7 @@ static void 
core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) */ if (!lun_acl) { atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); return; } nacl = lun_acl->se_lun_nacl; @@ -1418,7 +1418,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) &lun_acl->se_lun_group.cg_item); atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static sense_reason_t @@ -1552,14 +1552,14 @@ core_scsi3_decode_spec_i_port( continue; atomic_inc(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(tmp_tpg)) { pr_err(" core_scsi3_tpg_depend_item()" " for tmp_tpg\n"); atomic_dec(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; } @@ -1573,7 +1573,7 @@ core_scsi3_decode_spec_i_port( tmp_tpg, i_str); if (dest_node_acl) { atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } spin_unlock_irq(&tmp_tpg->acl_node_lock); @@ -1587,7 +1587,7 @@ core_scsi3_decode_spec_i_port( pr_err("configfs_depend_item() failed" " for dest_node_acl->acl_group\n"); atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); core_scsi3_tpg_undepend_item(tmp_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; @@ -1647,7 +1647,7 @@ core_scsi3_decode_spec_i_port( pr_err("core_scsi3_lunacl_depend_item()" " failed\n"); atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); core_scsi3_nodeacl_undepend_item(dest_node_acl); core_scsi3_tpg_undepend_item(dest_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -3168,14 +3168,14 @@ core_scsi3_emulate_pro_register_and_move(struct se_cmd *cmd, u64 res_key, continue; atomic_inc(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(dest_se_tpg)) { pr_err("core_scsi3_tpg_depend_item() failed" " for dest_se_tpg\n"); atomic_dec(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_put_pr_reg; } @@ -3273,7 +3273,7 @@ after_iport_check: initiator_str); if (dest_node_acl) { atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } spin_unlock_irq(&dest_se_tpg->acl_node_lock); @@ -3289,7 +3289,7 @@ after_iport_check: pr_err("core_scsi3_nodeacl_depend_item() for" " dest_node_acl\n"); atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); dest_node_acl = NULL; ret = TCM_INVALID_PARAMETER_LIST; goto out; @@ -3314,7 +3314,7 @@ after_iport_check: if (core_scsi3_lunacl_depend_item(dest_se_deve)) { pr_err("core_scsi3_lunacl_depend_item() failed\n"); atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); dest_se_deve = NULL; ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out; @@ -3880,7 +3880,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) add_desc_len = 0; atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&pr_tmpl->registration_lock); /* * Determine expected length of $FABRIC_MOD specific @@ -3894,7 +3894,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) " out of buffer: %d\n", 
cmd->data_length); spin_lock(&pr_tmpl->registration_lock); atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); break; } /* @@ -3956,7 +3956,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) spin_lock(&pr_tmpl->registration_lock); atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); /* * Set the ADDITIONAL DESCRIPTOR LENGTH */ diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index d4b98690a736..4badca1cd625 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -736,7 +736,7 @@ void target_qf_do_work(struct work_struct *work) list_for_each_entry_safe(cmd, cmd_tmp, &qf_cmd_list, se_qf_node) { list_del(&cmd->se_qf_node); atomic_dec(&dev->dev_qf_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); pr_debug("Processing %s cmd: %p QUEUE_FULL in work queue" " context: %s\n", cmd->se_tfo->get_fabric_name(), cmd, @@ -1148,7 +1148,7 @@ transport_check_alloc_task_attr(struct se_cmd *cmd) * Dormant to Active status. */ cmd->se_ordered_id = atomic_inc_return(&dev->dev_ordered_id); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); pr_debug("Allocated se_ordered_id: %u for Task Attr: 0x%02x on %s\n", cmd->se_ordered_id, cmd->sam_task_attr, dev->transport->name); @@ -1705,7 +1705,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) return false; case MSG_ORDERED_TAG: atomic_inc(&dev->dev_ordered_sync); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); pr_debug("Added ORDERED for CDB: 0x%02x to ordered list, " " se_ordered_id: %u\n", @@ -1723,7 +1723,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) * For SIMPLE and UNTAGGED Task Attribute commands */ atomic_inc(&dev->simple_cmds); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); break; } @@ -1828,7 +1828,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) if (cmd->sam_task_attr == MSG_SIMPLE_TAG) { atomic_dec(&dev->simple_cmds); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); dev->dev_cur_ordered_id++; pr_debug("Incremented dev->dev_cur_ordered_id: %u for" " SIMPLE: %u\n", dev->dev_cur_ordered_id, @@ -1840,7 +1840,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) cmd->se_ordered_id); } else if (cmd->sam_task_attr == MSG_ORDERED_TAG) { atomic_dec(&dev->dev_ordered_sync); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); dev->dev_cur_ordered_id++; pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED:" @@ -1899,7 +1899,7 @@ static void transport_handle_queue_full( spin_lock_irq(&dev->qf_cmd_lock); list_add_tail(&cmd->se_qf_node, &cmd->se_dev->qf_cmd_list); atomic_inc(&dev->dev_qf_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock_irq(&cmd->se_dev->qf_cmd_lock); schedule_work(&cmd->se_dev->qf_work_queue); @@ -2875,7 +2875,7 @@ void transport_send_task_abort(struct se_cmd *cmd) if (cmd->se_tfo->write_pending_status(cmd) != 0) { cmd->transport_state |= CMD_T_ABORTED; cmd->se_cmd_flags |= SCF_SEND_DELAYED_TAS; - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return; } } diff --git a/drivers/target/target_core_ua.c b/drivers/target/target_core_ua.c index 505519b10cb7..101858e245b3 100644 --- a/drivers/target/target_core_ua.c +++ b/drivers/target/target_core_ua.c @@ -162,7 +162,7 @@ int core_scsi3_ua_allocate( spin_unlock_irq(&nacl->device_list_lock); atomic_inc(&deve->ua_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return 0; } 
list_add_tail(&ua->ua_nacl_list, &deve->ua_list); @@ -175,7 +175,7 @@ int core_scsi3_ua_allocate( asc, ascq); atomic_inc(&deve->ua_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return 0; } @@ -190,7 +190,7 @@ void core_scsi3_ua_release_all( kmem_cache_free(se_ua_cache, ua); atomic_dec(&deve->ua_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&deve->ua_lock); } @@ -251,7 +251,7 @@ void core_scsi3_ua_for_check_condition( kmem_cache_free(se_ua_cache, ua); atomic_dec(&deve->ua_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); @@ -310,7 +310,7 @@ int core_scsi3_ua_clear_for_request_sense( kmem_cache_free(se_ua_cache, ua); atomic_dec(&deve->ua_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 41fe8a047d37..746ae80b972f 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -2041,7 +2041,7 @@ static int canon_copy_from_read_buf(struct tty_struct *tty, if (found) clear_bit(eol, ldata->read_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); ldata->read_tail += c; if (found) { diff --git a/drivers/tty/serial/mxs-auart.c b/drivers/tty/serial/mxs-auart.c index aa97fd845b4d..4b5b3c2fe328 100644 --- a/drivers/tty/serial/mxs-auart.c +++ b/drivers/tty/serial/mxs-auart.c @@ -200,7 +200,7 @@ static void dma_tx_callback(void *param) /* clear the bit used to serialize the DMA tx. */ clear_bit(MXS_AUART_DMA_TX_SYNC, &s->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* wake up the possible processes. */ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) @@ -275,7 +275,7 @@ static void mxs_auart_tx_chars(struct mxs_auart_port *s) mxs_auart_dma_tx(s, i); } else { clear_bit(MXS_AUART_DMA_TX_SYNC, &s->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } return; } diff --git a/drivers/usb/gadget/tcm_usb_gadget.c b/drivers/usb/gadget/tcm_usb_gadget.c index f058c0368d61..819875c7e394 100644 --- a/drivers/usb/gadget/tcm_usb_gadget.c +++ b/drivers/usb/gadget/tcm_usb_gadget.c @@ -1851,7 +1851,7 @@ static int usbg_port_link(struct se_portal_group *se_tpg, struct se_lun *lun) struct usbg_tpg *tpg = container_of(se_tpg, struct usbg_tpg, se_tpg); atomic_inc(&tpg->tpg_port_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return 0; } @@ -1861,7 +1861,7 @@ static void usbg_port_unlink(struct se_portal_group *se_tpg, struct usbg_tpg *tpg = container_of(se_tpg, struct usbg_tpg, se_tpg); atomic_dec(&tpg->tpg_port_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static int usbg_check_stop_free(struct se_cmd *se_cmd) diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c index 640fe0173236..f1ec1680e822 100644 --- a/drivers/usb/serial/usb_wwan.c +++ b/drivers/usb/serial/usb_wwan.c @@ -325,7 +325,7 @@ static void usb_wwan_outdat_callback(struct urb *urb) for (i = 0; i < N_OUT_URB; ++i) { if (portdata->out_urbs[i] == urb) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(i, &portdata->out_busy); break; } diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index cf50ce93975b..aeb513108448 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1255,7 +1255,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, tpg->tv_tpg_vhost_count++; tpg->vhost_scsi = vs; vs_tpg[tpg->tport_tpgt] = tpg; - smp_mb__after_atomic_inc(); + 
smp_mb__after_atomic(); match = true; } mutex_unlock(&tpg->tv_tpg_mutex); diff --git a/drivers/w1/w1_family.c b/drivers/w1/w1_family.c index 3bff6b37b472..3651ec801f45 100644 --- a/drivers/w1/w1_family.c +++ b/drivers/w1/w1_family.c @@ -139,9 +139,9 @@ void w1_family_get(struct w1_family *f) void __w1_family_get(struct w1_family *f) { - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&f->refcnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } EXPORT_SYMBOL(w1_unregister_family); diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 607e41460c0d..c4a0666de6f5 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -348,9 +348,9 @@ void xen_pcibk_do_op(struct work_struct *data) notify_remote_via_irq(pdev->evtchn_irq); /* Mark that we're done. */ - smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ + smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ clear_bit(_PDEVF_op_active, &pdev->flags); - smp_mb__after_clear_bit(); /* /before/ final check for work */ + smp_mb__after_atomic(); /* /before/ final check for work */ /* Check to see if the driver domain tried to start another request in * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c9a24444ec9a..2256e9cceec5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -279,7 +279,7 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3955e475ceec..f29a54e454d4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3458,7 +3458,7 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, static void end_extent_buffer_writeback(struct extent_buffer *eb) { clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f805bc944fa..5a3b8371772e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7126,7 +7126,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) * before atomic variable goto zero, we must make sure * dip->errors is perceived to be set. */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); } /* if there are more bios still pending for this dio, just exit */ @@ -7306,7 +7306,7 @@ out_err: * before atomic variable goto zero, we must * make sure dip->errors is perceived to be set. 
*/ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); if (atomic_dec_and_test(&dip->pending_bios)) bio_io_error(dip->orig_bio); @@ -7449,7 +7449,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, return 0; atomic_inc(&inode->i_dio_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * The generic stuff only does filemap_write_and_wait_range, which diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e79ff6b90cb7..f45040a4bb76 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -642,7 +642,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, return -EINVAL; atomic_inc(&root->will_be_snapshoted); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); btrfs_wait_nocow_write(root); ret = btrfs_start_delalloc_inodes(root, 0); diff --git a/fs/buffer.c b/fs/buffer.c index 9ddb9fc7d923..6a8110c03a47 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -77,7 +77,7 @@ EXPORT_SYMBOL(__lock_buffer); void unlock_buffer(struct buffer_head *bh) { clear_bit_unlock(BH_Lock, &bh->b_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&bh->b_state, BH_Lock); } EXPORT_SYMBOL(unlock_buffer); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f3b84cd9de56..08b3c116915b 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -42,7 +42,7 @@ int ext4_resize_begin(struct super_block *sb) void ext4_resize_end(struct super_block *sb) { clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index aec7f73832f0..c355f7320e44 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -277,7 +277,7 @@ static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holde static void gfs2_holder_wake(struct gfs2_holder *gh) { clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&gh->gh_iflags, HIF_WAIT); } @@ -411,7 +411,7 @@ static void gfs2_demote_wake(struct gfs2_glock *gl) { gl->gl_demote_state = LM_ST_EXCLUSIVE; clear_bit(GLF_DEMOTE, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&gl->gl_flags, GLF_DEMOTE); } @@ -620,7 +620,7 @@ out: out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); gl->gl_lockref.count++; if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gl->gl_lockref.count--; @@ -628,7 +628,7 @@ out_sched: out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 54b66809e818..74d9a3dbf16f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -221,7 +221,7 @@ static void inode_go_sync(struct gfs2_glock *gl) * Writeback of the data mapping may cause the dirty flag to be set * so we have to clear it again here. 
*/ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(GLF_DIRTY, &gl->gl_flags); } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index c1eb555dc588..91f274de1246 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1134,7 +1134,7 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); spin_unlock(&ls->ls_recover_spin); } @@ -1271,7 +1271,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); return 0; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 7ad4094d68c0..fe7a56fb6084 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -587,7 +587,7 @@ fail: gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index de25d5577e5d..529d9a9eb897 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -333,7 +333,7 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); else if (val == 0) { clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); gfs2_glock_thaw(sdp); } else { ret = -EINVAL; @@ -482,7 +482,7 @@ static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) rv = jid = -EINVAL; sdp->sd_lockstruct.ls_jid = jid; clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); out: spin_unlock(&sdp->sd_jindex_spin); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5f26139a165a..6fac74349856 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -43,7 +43,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) clear_buffer_uptodate(bh); if (orig_bh) { clear_bit_unlock(BH_Shadow, &orig_bh->b_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&orig_bh->b_state, BH_Shadow); } unlock_buffer(bh); @@ -239,7 +239,7 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } spin_unlock(&journal->j_list_lock); @@ -277,7 +277,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, } spin_lock(&journal->j_list_lock); clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d9f3d067cd15..4a3d4ef76127 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2032,9 +2032,9 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry) { put_rpccred(entry->cred); kfree(entry); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static void 
nfs_access_free_list(struct list_head *head) @@ -2082,9 +2082,9 @@ nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) else { remove_lru_entry: list_del_init(&nfsi->access_cache_inode_lru); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } spin_unlock(&inode->i_lock); } @@ -2232,9 +2232,9 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) nfs_access_add_rbtree(inode, cache); /* Update accounting */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_long_inc(&nfs_access_nr_entries); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* Add inode to global LRU list */ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0c438973f3c8..e6f7398d2b3c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1085,7 +1085,7 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_INVALIDATING); out: return ret; diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index efac602edb37..b9c61efe9660 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -789,9 +789,9 @@ static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS4DS_CONNECTING, &ds->ds_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2349518eef2c..c0583b9bef71 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1140,9 +1140,9 @@ static int nfs4_run_state_manager(void *); static void nfs4_clear_state_manager_bit(struct nfs_client *clp) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING); rpc_wake_up(&clp->cl_rpcwaitq); } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 2ffebf2081ce..03ed984ab4d8 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -95,7 +95,7 @@ nfs_iocounter_dec(struct nfs_io_counter *c) { if (atomic_dec_and_test(&c->io_count)) { clear_bit(NFS_IO_INPROGRESS, &c->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&c->flags, NFS_IO_INPROGRESS); } } @@ -193,9 +193,9 @@ void nfs_unlock_request(struct nfs_page *req) printk(KERN_ERR "NFS: Invalid unlock attempted\n"); BUG(); } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(PG_BUSY, &req->wb_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&req->wb_flags, PG_BUSY); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index cb53d450ae32..fd9536e494bc 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1810,7 +1810,7 @@ static void pnfs_clear_layoutcommitting(struct inode *inode) unsigned long *bitlock = &NFS_I(inode)->flags; clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 023793909778..c3058a076596 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -275,7 
+275,7 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg) { if (lseg) { atomic_inc(&lseg->pls_refcount); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } return lseg; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9a3b6a4cd6b9..ffb9459f180b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -405,7 +405,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_pageio_complete(&pgio); clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_FLUSHING); if (err < 0) @@ -1458,7 +1458,7 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) static void nfs_commit_clear_lock(struct nfs_inode *nfsi) { clear_bit(NFS_INO_COMMIT, &nfsi->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 4b826abb1528..45d4e96a6bac 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -460,9 +460,9 @@ static int write_cnodes(struct ubifs_info *c) * important. */ clear_bit(DIRTY_CNODE, &cnode->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(COW_CNODE, &cnode->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); offs += len; dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 52a6559275c4..3600994f8411 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -895,9 +895,9 @@ static int write_index(struct ubifs_info *c) * the reason for the second barrier. */ clear_bit(DIRTY_ZNODE, &znode->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(COW_ZNODE, &znode->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* * We have marked the znode as clean but have not updated the diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h index 9ae6c34dc191..49673510b484 100644 --- a/include/asm-generic/bitops/atomic.h +++ b/include/asm-generic/bitops/atomic.h @@ -80,7 +80,7 @@ static inline void set_bit(int nr, volatile unsigned long *addr) * * clear_bit() is atomic and may not be reordered. However, it does * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() * in order to ensure changes are visible on other processors. 
*/ static inline void clear_bit(int nr, volatile unsigned long *addr) diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h index 308a9e22c802..c30266e94806 100644 --- a/include/asm-generic/bitops/lock.h +++ b/include/asm-generic/bitops/lock.h @@ -20,7 +20,7 @@ */ #define clear_bit_unlock(nr, addr) \ do { \ - smp_mb__before_clear_bit(); \ + smp_mb__before_atomic(); \ clear_bit(nr, addr); \ } while (0) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c40302f909ce..7cbf837a279c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -278,7 +278,7 @@ static inline void get_bh(struct buffer_head *bh) static inline void put_bh(struct buffer_head *bh) { - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&bh->b_count); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 9f3c275e053e..ec274e0f4ed2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -649,7 +649,7 @@ static inline void hd_ref_init(struct hd_struct *part) static inline void hd_struct_get(struct hd_struct *part) { atomic_inc(&part->ref); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } static inline int hd_struct_try_get(struct hd_struct *part) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c7bfac1c4a7b..157111043281 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -453,7 +453,7 @@ static inline int tasklet_trylock(struct tasklet_struct *t) static inline void tasklet_unlock(struct tasklet_struct *t) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(TASKLET_STATE_RUN, &(t)->state); } @@ -501,7 +501,7 @@ static inline void tasklet_hi_schedule_first(struct tasklet_struct *t) static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } static inline void tasklet_disable(struct tasklet_struct *t) @@ -513,13 +513,13 @@ static inline void tasklet_disable(struct tasklet_struct *t) static inline void tasklet_enable(struct tasklet_struct *t) { - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&t->count); } static inline void tasklet_hi_enable(struct tasklet_struct *t) { - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&t->count); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ed3a3aa6604..616415a4fee4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -493,7 +493,7 @@ static inline void napi_disable(struct napi_struct *n) static inline void napi_enable(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 25f54c79f757..010cde3b44cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2782,10 +2782,8 @@ static inline bool __must_check current_set_polling_and_test(void) /* * Polling state must be visible before we test NEED_RESCHED, * paired by resched_task() - * - * XXX: assumes set/clear bit are identical barrier wise. 
*/ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return unlikely(tif_need_resched()); } @@ -2803,7 +2801,7 @@ static inline bool __must_check current_clr_polling_and_test(void) * Polling state must be visible before we test NEED_RESCHED, * paired by resched_task() */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return unlikely(tif_need_resched()); } diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 3a847de83fab..ad7dbe2cfecd 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -142,18 +142,18 @@ struct rpc_task_setup { test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) #define rpc_clear_running(t) \ do { \ - smp_mb__before_clear_bit(); \ + smp_mb__before_atomic(); \ clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \ - smp_mb__after_clear_bit(); \ + smp_mb__after_atomic(); \ } while (0) #define RPC_IS_QUEUED(t) test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate) #define rpc_set_queued(t) set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate) #define rpc_clear_queued(t) \ do { \ - smp_mb__before_clear_bit(); \ + smp_mb__before_atomic(); \ clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \ - smp_mb__after_clear_bit(); \ + smp_mb__after_atomic(); \ } while (0) #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 3e5efb2b236e..3876f0f1dfd3 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -379,9 +379,9 @@ static inline int xprt_test_and_clear_connected(struct rpc_xprt *xprt) static inline void xprt_clear_connecting(struct rpc_xprt *xprt) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static inline int xprt_connecting(struct rpc_xprt *xprt) @@ -411,9 +411,9 @@ static inline void xprt_clear_bound(struct rpc_xprt *xprt) static inline void xprt_clear_binding(struct rpc_xprt *xprt) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_BINDING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt) diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 1e98b5530425..6f8ab7da27c4 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -191,7 +191,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) * pairs with task_work_add()->set_notify_resume() after * hlist_add_head(task->task_works); */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (unlikely(current->task_works)) task_work_run(); } diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 5679d927562b..624a8a54806d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1204,7 +1204,7 @@ static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp) /* put back the conn without restarting its timer */ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) { - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&cp->refcnt); } void ip_vs_conn_put(struct ip_vs_conn *cp); @@ -1408,7 +1408,7 @@ static inline void ip_vs_dest_hold(struct ip_vs_dest *dest) static inline void ip_vs_dest_put(struct ip_vs_dest *dest) { - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&dest->refcnt); } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2956c8da1605..1adf62b39b96 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ 
-534,7 +534,7 @@ return_normal: kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&slaves_in_kgdb); dbg_touch_watchdogs(); local_irq_restore(flags); @@ -662,7 +662,7 @@ kgdb_restore: kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&masters_in_kgdb); /* Free kgdb_active */ atomic_set(&kgdb_active, -1); diff --git a/kernel/futex.c b/kernel/futex.c index 5f589279e462..b991ec05b8f9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -267,7 +267,7 @@ static inline void futex_get_mm(union futex_key *key) * get_futex_key() implies a full barrier. This is relied upon * as full barrier (B), see the ordering comment above. */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } /* @@ -280,7 +280,7 @@ static inline void hb_waiters_inc(struct futex_hash_bucket *hb) /* * Full barrier (A), see the ordering comment above. */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); #endif } diff --git a/kernel/kmod.c b/kernel/kmod.c index 6b375af4958d..0ac67a5861c5 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -498,7 +498,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth) static void helper_lock(void) { atomic_inc(&running_helpers); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } static void helper_unlock(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0c47e300210a..88b4a1dcb58c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -387,9 +387,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, } rcu_prepare_for_idle(smp_processor_id()); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ + smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + smp_mb__after_atomic(); /* Force ordering with next sojourn. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); /* @@ -507,10 +507,10 @@ void rcu_irq_exit(void) static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, int user) { - smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ + smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ + smp_mb__after_atomic(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); @@ -635,10 +635,10 @@ void rcu_nmi_enter(void) (atomic_read(&rdtp->dynticks) & 0x1)) return; rdtp->dynticks_nmi_nesting++; - smp_mb__before_atomic_inc(); /* Force delay from prior write. */ + smp_mb__before_atomic(); /* Force delay from prior write. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ + smp_mb__after_atomic(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); } @@ -657,9 +657,9 @@ void rcu_nmi_exit(void) --rdtp->dynticks_nmi_nesting != 0) return; /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ + smp_mb__before_atomic(); /* See above. 
*/ atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force delay to next write. */ + smp_mb__after_atomic(); /* Force delay to next write. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } @@ -2790,7 +2790,7 @@ void synchronize_sched_expedited(void) s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ + smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); return; } @@ -2808,7 +2808,7 @@ void synchronize_sched_expedited(void) s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ + smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); return; } @@ -2837,7 +2837,7 @@ void synchronize_sched_expedited(void) s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ + smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_done_lost); break; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 962d1d589929..56db2f853e43 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2523,9 +2523,9 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) /* Record start of fully idle period. */ j = jiffies; ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); } @@ -2590,9 +2590,9 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) } /* Record end of idle period. */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); /* diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8b836b376d91..746bc9344969 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * do a write memory barrier, and then update the count, to * make sure the vector is visible when count is set. */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&(vec)->count); do_mb = 1; } @@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * the new priority vec. */ if (do_mb) - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * When removing from the vector, we decrement the counter first * do a memory barrier and then clear the mask. */ atomic_dec(&(vec)->count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); cpumask_clear_cpu(cpu, vec->mask); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 7d50f794e248..0ffa20ae657b 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -394,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit); * * In order for this to function properly, as it uses waitqueue_active() * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_clear_bit(), but in some + * this. 
Typically, this will be smp_mb__after_atomic(), but in some * cases where bitflags are manipulated non-atomically under a lock, one * may need to use a less regular barrier, such fs/inode.c's smp_mb(), * because spin_unlock() does not guarantee a memory barrier. diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 09d9591b7708..1706cbbdf5f0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -557,7 +557,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) bit = sync ? BDI_sync_congested : BDI_async_congested; if (test_and_clear_bit(bit, &bdi->state)) atomic_dec(&nr_bdi_congested[sync]); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } diff --git a/mm/filemap.c b/mm/filemap.c index a82fbe4c9e8e..c73535c914cc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -740,7 +740,7 @@ void unlock_page(struct page *page) { VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -757,7 +757,7 @@ void end_page_writeback(struct page *page) if (!test_clear_page_writeback(page)) BUG(); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_page(page, PG_writeback); } EXPORT_SYMBOL(end_page_writeback); diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 8c93267ce969..c4e09846d1de 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -252,7 +252,7 @@ static int pppoatm_may_send(struct pppoatm_vcc *pvcc, int size) * we need to ensure there's a memory barrier after it. The bit * *must* be set before we do the atomic_inc() on pvcc->inflight. * There's no smp_mb__after_set_bit(), so it's this or abuse - * smp_mb__after_clear_bit(). + * smp_mb__after_atomic(). */ test_and_set_bit(BLOCKED, &pvcc->blocked); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 49774912cb01..74014420b3c7 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -45,7 +45,7 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) return; clear_bit(HCI_INQUIRY, &hdev->flags); - smp_mb__after_clear_bit(); /* wake_up_bit advises about this barrier */ + smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */ wake_up_bit(&hdev->flags, HCI_INQUIRY); hci_conn_check_pending(hdev); @@ -1768,7 +1768,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags)) return; - smp_mb__after_clear_bit(); /* wake_up_bit advises about this barrier */ + smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */ wake_up_bit(&hdev->flags, HCI_INQUIRY); if (!test_bit(HCI_MGMT, &hdev->dev_flags)) diff --git a/net/core/dev.c b/net/core/dev.c index 5b3042e69f85..e14f1cba591a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1326,7 +1326,7 @@ static int __dev_close_many(struct list_head *head) * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + smp_mb__after_atomic(); /* Commit netif_running(). 
*/ } dev_deactivate_many(head); @@ -3343,7 +3343,7 @@ static void net_tx_action(struct softirq_action *h) root_lock = qdisc_lock(q); if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); @@ -3353,7 +3353,7 @@ static void net_tx_action(struct softirq_action *h) &q->state)) { __netif_reschedule(q); } else { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); } @@ -4244,7 +4244,7 @@ void __napi_complete(struct napi_struct *n) BUG_ON(n->gro_list); list_del(&n->poll_list); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 9c3a839322ba..bd0767e6b2b3 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -147,7 +147,7 @@ static void linkwatch_do_dev(struct net_device *dev) * Make sure the above read is complete since it can be * rewritten as soon as we clear the bit below. */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); /* We are about to handle this device, * so new events can be accepted diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 48f424465112..56cd458a1b8c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -522,7 +522,7 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { p->dtime = (__u32)jiffies; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&p->refcnt); } EXPORT_SYMBOL_GPL(inet_putpeer); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 025e25093984..366cf06587b8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1930,10 +1930,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. 
- * We abuse smp_mb__after_clear_bit() because - * there is no smp_mb__after_set_bit() yet */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (atomic_read(&sk->sk_wmem_alloc) > limit) break; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 75421f2ba8be..1f4f954c4b47 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -914,7 +914,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&net->ct.count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index b7ebe23cdedf..d67de453c35a 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -598,7 +598,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -606,7 +606,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, static u64 rds_ib_get_ack(struct rds_ib_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 45033358358e..aa8bf6786008 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -429,7 +429,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -437,7 +437,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, static u64 rds_iw_get_ack(struct rds_iw_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } diff --git a/net/rds/send.c b/net/rds/send.c index a82fb660ec00..23718160d71e 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -107,7 +107,7 @@ static int acquire_in_xmit(struct rds_connection *conn) static void release_in_xmit(struct rds_connection *conn) { clear_bit(RDS_IN_XMIT, &conn->c_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a * hot path and finding waiters is very rare. 
We don't want to walk @@ -661,7 +661,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, /* order flag updates with spin locks */ if (!list_empty(&list)) - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&conn->c_lock, flags); @@ -691,7 +691,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) } /* order flag updates with the rs lock */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&rs->rs_lock, flags); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 81cf5a4c5e40..53b17ca0dff5 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -93,7 +93,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, rm->m_ack_seq = tc->t_last_sent_nxt + sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); tc->t_last_expected_una = rm->m_ack_seq + 1; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 5285ead196c0..247e973544bf 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -296,7 +296,7 @@ static void rpcauth_unhash_cred_locked(struct rpc_cred *cred) { hlist_del_rcu(&cred->cr_hash); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 36e431ee1c90..b6e440baccc3 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -143,7 +143,7 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) gss_get_ctx(ctx); rcu_assign_pointer(gss_cred->gc_ctx, ctx); set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 3513d559bc45..9761a0da964d 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -244,10 +244,10 @@ void xprt_free_bc_request(struct rpc_rqst *req) dprintk("RPC: free backchannel req=%p\n", req); req->rq_connect_cookie = xprt->connect_cookie - 1; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (!xprt_need_to_requeue(xprt)) { /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d173f79947c6..89d051de6b3e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -230,9 +230,9 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } else queue_work(rpciod_workqueue, &xprt->task_cleanup); } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 25a3dcf15cae..402a7e9a16b7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -893,11 +893,11 @@ static void xs_close(struct rpc_xprt *xprt) xs_reset_transport(transport); xprt->reestablish_timeout = 0; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); xprt_disconnect_done(xprt); } @@ -1497,12 +1497,12 @@ static void 
xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static void xs_sock_mark_closed(struct rpc_xprt *xprt) @@ -1556,10 +1556,10 @@ static void xs_tcp_state_change(struct sock *sk) xprt->connect_cookie++; xprt->reestablish_timeout = 0; set_bit(XPRT_CLOSING, &xprt->state); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); break; case TCP_CLOSE_WAIT: @@ -1578,9 +1578,9 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); break; case TCP_CLOSE: xs_tcp_cancel_linger_timeout(xprt); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bb7e8ba821f4..749f80c21e22 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1207,7 +1207,7 @@ restart: sk->sk_state = TCP_ESTABLISHED; sock_hold(newsk); - smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ unix_peer(sk) = newsk; unix_state_unlock(sk); diff --git a/sound/pci/bt87x.c b/sound/pci/bt87x.c index 8546711d12f9..70951fd9b354 100644 --- a/sound/pci/bt87x.c +++ b/sound/pci/bt87x.c @@ -443,7 +443,7 @@ static int snd_bt87x_pcm_open(struct snd_pcm_substream *substream) _error: clear_bit(0, &chip->opened); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return err; } @@ -458,7 +458,7 @@ static int snd_bt87x_close(struct snd_pcm_substream *substream) chip->substream = NULL; clear_bit(0, &chip->opened); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return 0; } -- cgit v1.2.3 From 1413c03893332366e5b4d1e26f942ada25f3e82a Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 8 Jan 2014 14:21:46 -0500 Subject: lockdep: Increase static allocations Fuzzing a recent kernel with a large configuration hits the static allocation limits and disables lockdep. This patch doubles the limits. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389208906-24338-1-git-send-email-sasha.levin@oracle.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep_internals.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8f..51c4b24b6328 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -54,9 +54,9 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. 
*/ -#define MAX_LOCKDEP_ENTRIES 16384UL +#define MAX_LOCKDEP_ENTRIES 32768UL -#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_LOCKDEP_CHAINS_BITS 16 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) @@ -65,7 +65,7 @@ enum { * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ -#define MAX_STACK_TRACE_ENTRIES 262144UL +#define MAX_STACK_TRACE_ENTRIES 524288UL extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -- cgit v1.2.3 From c04ae71c9c264312a6f57d2665a79f7bbccf8758 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 8 Apr 2014 17:33:19 -0700 Subject: sched_clock: Remove deprecated setup_sched_clock() API Remove the 32-bit only setup_sched_clock() API now that all users have been converted to the 64-bit friendly sched_clock_register(). Signed-off-by: Stephen Boyd Signed-off-by: John Stultz --- include/linux/sched_clock.h | 1 - kernel/time/sched_clock.c | 13 ------------- 2 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index cddf0c2940b6..efa931c5cef1 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -14,7 +14,6 @@ extern void sched_clock_postinit(void); static inline void sched_clock_postinit(void) { } #endif -extern void setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate); extern void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 4d23dc4d8139..445106d2c729 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); } -static u32 __read_mostly (*read_sched_clock_32)(void); - -static u64 notrace read_sched_clock_32_wrapper(void) -{ - return read_sched_clock_32(); -} - static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, pr_debug("Registered %pF as sched_clock source\n", read); } -void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) -{ - read_sched_clock_32 = read; - sched_clock_register(read_sched_clock_32_wrapper, bits, rate); -} - void __init sched_clock_postinit(void) { /* -- cgit v1.2.3 From 86d56134f1b67d0c18025ba5cade95c048ed528d Mon Sep 17 00:00:00 2001 From: Michael Marineau Date: Thu, 10 Apr 2014 14:09:31 -0700 Subject: kobject: Make support for uevent_helper optional. Support for uevent_helper, aka hotplug, is not required on many systems these days but it can still be enabled via sysfs or sysctl. 
Reported-by: Darren Shepherd Signed-off-by: Michael Marineau Signed-off-by: Greg Kroah-Hartman --- drivers/base/Kconfig | 17 +++++++++++------ include/linux/kobject.h | 2 ++ kernel/ksysfs.c | 5 ++++- kernel/sysctl.c | 4 ++-- lib/kobject_uevent.c | 6 ++++++ 5 files changed, 25 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 8fa8deab6449..4b7b4522b64f 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -1,10 +1,10 @@ menu "Generic Driver Options" -config UEVENT_HELPER_PATH - string "path to uevent helper" - default "" +config UEVENT_HELPER + bool "Support for uevent helper" + default y help - Path to uevent helper program forked by the kernel for + The uevent helper program is forked by the kernel for every uevent. Before the switch to the netlink-based uevent source, this was used to hook hotplug scripts into kernel device events. It @@ -15,8 +15,13 @@ config UEVENT_HELPER_PATH that it creates a high system load, or on smaller systems it is known to create out-of-memory situations during bootup. - To disable user space helper program execution at early boot - time specify an empty string here. This setting can be altered +config UEVENT_HELPER_PATH + string "path to uevent helper" + depends on UEVENT_HELPER + default "" + help + To disable user space helper program execution at by default + specify an empty string here. This setting can still be altered via /proc/sys/kernel/hotplug or via /sys/kernel/uevent_helper later at runtime. diff --git a/include/linux/kobject.h b/include/linux/kobject.h index f896a33e8341..2d61b909f414 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -32,8 +32,10 @@ #define UEVENT_NUM_ENVP 32 /* number of env pointers */ #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ +#ifdef CONFIG_UEVENT_HELPER /* path to the userspace helper executed on an event */ extern char uevent_helper[]; +#endif /* counter to tag the uevent, read only except for the kobject core */ extern u64 uevent_seqnum; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 2495a9b14ac8..6683ccef9fff 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -37,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, } KERNEL_ATTR_RO(uevent_seqnum); +#ifdef CONFIG_UEVENT_HELPER /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -56,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj, return count; } KERNEL_ATTR_RW(uevent_helper); - +#endif #ifdef CONFIG_PROFILING static ssize_t profiling_show(struct kobject *kobj, @@ -189,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, &uevent_seqnum_attr.attr, +#ifdef CONFIG_UEVENT_HELPER &uevent_helper_attr.attr, +#endif #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..bc966a8ffc3e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -643,7 +643,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif - +#ifdef CONFIG_UEVENT_HELPER { .procname = "hotplug", .data = &uevent_helper, @@ -651,7 +651,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, - +#endif #ifdef CONFIG_CHR_DEV_SG { .procname = "sg-big-buff", diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 4e3bd71bd949..9ebf9e20de53 100644 --- a/lib/kobject_uevent.c +++ 
b/lib/kobject_uevent.c @@ -29,7 +29,9 @@ u64 uevent_seqnum; +#ifdef CONFIG_UEVENT_HELPER char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH; +#endif #ifdef CONFIG_NET struct uevent_sock { struct list_head list; @@ -109,6 +111,7 @@ static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data) } #endif +#ifdef CONFIG_UEVENT_HELPER static int kobj_usermode_filter(struct kobject *kobj) { const struct kobj_ns_type_operations *ops; @@ -147,6 +150,7 @@ static void cleanup_uevent_env(struct subprocess_info *info) { kfree(info->data); } +#endif /** * kobject_uevent_env - send an uevent with environmental data @@ -323,6 +327,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, #endif mutex_unlock(&uevent_sock_mutex); +#ifdef CONFIG_UEVENT_HELPER /* call uevent_helper, usually only enabled during early boot */ if (uevent_helper[0] && !kobj_usermode_filter(kobj)) { struct subprocess_info *info; @@ -347,6 +352,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, env = NULL; /* freed by cleanup_uevent_env */ } } +#endif exit: kfree(devpath); -- cgit v1.2.3 From 4881f603d7b82df2bc15efd2a272f973a3bf8df1 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 25 Apr 2014 08:44:59 +0800 Subject: PM / hibernate: use unsigned local variables in swsusp_show_speed() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit do_div() needs 'u64' type, or it reports warning. And negative number is meaningless for "speed", so change all signed to unsigned within swsusp_show_speed(). The related warning (with allmodconfig for unicore32): CC kernel/power/hibernate.o kernel/power/hibernate.c: In function ‘swsusp_show_speed’: kernel/power/hibernate.c:237: warning: comparison of distinct pointer types lacks a cast Signed-off-by: Chen Gang [rjw: Subject] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f4f2073711d3..de4b989cc8fd 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -228,19 +228,23 @@ static void platform_recover(int platform_mode) void swsusp_show_speed(struct timeval *start, struct timeval *stop, unsigned nr_pages, char *msg) { - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; + u64 elapsed_centisecs64; + unsigned int centisecs; + unsigned int k; + unsigned int kps; elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + /* + * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, + * it is obvious enough for what went wrong. + */ do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); centisecs = elapsed_centisecs64; if (centisecs == 0) centisecs = 1; /* avoid div-by-zero */ k = nr_pages * (PAGE_SIZE / 1024); kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", + printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", msg, k, centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10); -- cgit v1.2.3 From 91dc95427a0d30ac2c58d6e943c7f40a3f25d908 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Feb 2014 09:47:13 -0800 Subject: rcu: Protect ->gp_flags accesses with ACCESS_ONCE() A number of ->gp_flags accesses don't have ACCESS_ONCE(), but all of the can race against other loads or stores. This commit therefore applies ACCESS_ONCE() to the unprotected ->gp_flags accesses. 
Reported-by: Alexey Roytman Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0c47e300210a..2c53ac924cab 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1403,12 +1403,12 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - if (rsp->gp_flags == 0) { + if (!ACCESS_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); return 0; } - rsp->gp_flags = 0; /* Clear all flags: New grace period. */ + ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */ if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { /* @@ -1501,7 +1501,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - rsp->gp_flags &= ~RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } return fqs_state; @@ -1566,7 +1566,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ if (cpu_needs_another_gp(rsp, rdp)) { - rsp->gp_flags = RCU_GP_FLAG_INIT; + ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("newreq")); @@ -1695,7 +1695,7 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, */ return; } - rsp->gp_flags = RCU_GP_FLAG_INIT; + ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("newreq")); @@ -2320,7 +2320,7 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } - rsp->gp_flags |= RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } -- cgit v1.2.3 From 24342c963a7fc04ec8f199778427943509f0bd5e Mon Sep 17 00:00:00 2001 From: Liu Ping Fan Date: Mon, 24 Feb 2014 06:18:09 -0800 Subject: rcu: Fix incorrect notes for code Signed-off-by: Liu Ping Fan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 962d1d589929..f9c9057239a9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2109,7 +2109,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) } #ifndef CONFIG_RCU_NOCB_CPU_ALL -/* Is the specified CPU a no-CPUs CPU? */ +/* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { if (have_rcu_nocb_mask) -- cgit v1.2.3 From 83ebe63ead0fe60e4b548730800cb68293ce098b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Mar 2014 11:09:10 -0800 Subject: rcu: Print negatives for stall-warning counter wraparound The print_other_cpu_stall() and print_cpu_stall() functions print grace-period numbers using an unsigned format, which means that the number one less than zero is a very large number. This commit therefore causes these numbers to be printed with a signed format in order to improve readability of the RCU CPU stall-warning output. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2c53ac924cab..b33c29a99df3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -932,9 +932,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; - pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", + pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), - rsp->gpnum, rsp->completed, totqlen); + (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected == 0) pr_err("INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) @@ -971,8 +971,9 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; - pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", - jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); + pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", + jiffies - rsp->gp_start, + (long)rsp->gpnum, (long)rsp->completed, totqlen); if (!trigger_all_cpu_backtrace()) dump_stack(); -- cgit v1.2.3 From 365187fbc04fd55766bf6a94e37e558505bf480a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Mar 2014 10:55:52 -0700 Subject: rcu: Update cpu_needs_another_gp() for futures from non-NOCB CPUs In the old days, the only source of requests for future grace periods was NOCB CPUs. This has changed: CPUs routinely post requests for future grace periods in order to promote power efficiency and reduce OS jitter with minimal impact on grace-period latency. This commit therefore updates cpu_needs_another_gp() to invoke rcu_future_needs_gp() instead of rcu_nocb_needs_gp(). The latter is no longer used, so is now removed. This commit also adds tracing for the irq_work_queue() wakeup case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 39 +++++++++++++++++++++++++++++---------- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 18 ------------------ 3 files changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b33c29a99df3..b4688993e956 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -323,6 +323,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) rdp->nxttail[RCU_DONE_TAIL] != NULL; } +/* + * Return the root node of the specified rcu_state structure. + */ +static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +{ + return &rsp->node[0]; +} + +/* + * Is there any need for future grace periods? + * Interrupts must be disabled. If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_future_needs_gp(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1; + int *fp = &rnp->need_future_gp[idx]; + + return ACCESS_ONCE(*fp); +} + /* * Does the current CPU require a not-yet-started grace period? * The caller must have disabled interrupts to prevent races with @@ -335,7 +357,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_gp_in_progress(rsp)) return 0; /* No, a grace period is already in progress. */ - if (rcu_nocb_needs_gp(rsp)) + if (rcu_future_needs_gp(rsp)) return 1; /* Yes, a no-CBs CPU needs one. 
*/ if (!rdp->nxttail[RCU_NEXT_TAIL]) return 0; /* No, this is a no-CBs (or offline) CPU. */ @@ -349,14 +371,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) return 0; /* No grace period needed. */ } -/* - * Return the root node of the specified rcu_state structure. - */ -static struct rcu_node *rcu_get_root(struct rcu_state *rsp) -{ - return &rsp->node[0]; -} - /* * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state * @@ -1672,6 +1686,8 @@ static void rsp_wakeup(struct irq_work *work) /* Wake up rcu_gp_kthread() to start the grace period. */ wake_up(&rsp->gp_wq); + trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), + "Workqueuewoken"); } /* @@ -1706,8 +1722,11 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, * the wakeup to interrupt context. And don't bother waking * up the running kthread. */ - if (current != rsp->gp_kthread) + if (current != rsp->gp_kthread) { + trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), + "Workqueuewake"); irq_work_queue(&rsp->wakeup_work); + } } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 75dc3c39a02a..7b572c5c65e1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -547,7 +547,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); -static int rcu_nocb_needs_gp(struct rcu_state *rsp); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f9c9057239a9..f60dd6ea8333 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2067,19 +2067,6 @@ static int __init parse_rcu_nocb_poll(char *arg) } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); -/* - * Do any no-CBs CPUs need another grace period? - * - * Interrupts must be disabled. If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_nocb_needs_gp(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - - return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; -} - /* * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. @@ -2402,11 +2389,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static int rcu_nocb_needs_gp(struct rcu_state *rsp) -{ - return 0; -} - static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } -- cgit v1.2.3 From 9b67122ae3da3018c966148233739116ed89502a Mon Sep 17 00:00:00 2001 From: Iulia Manda Date: Tue, 11 Mar 2014 13:18:22 +0200 Subject: rcu: Remove unused rcu_data structure field The ->preemptible field in rcu_data is only initialized in the function rcu_init_percpu_data(), and never used. This commit therefore removes this field. Signed-off-by: Iulia Manda Reviewed-by: Josh Triplett Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 6 ++---- kernel/rcu/tree.h | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b4688993e956..2cc39c781085 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3180,7 +3180,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) * that this CPU cannot possibly have any RCU callbacks in flight yet. */ static void -rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) +rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; unsigned long mask; @@ -3193,7 +3193,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. */ - rdp->preemptible = preemptible; rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -3237,8 +3236,7 @@ static void rcu_prepare_cpu(int cpu) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - rcu_init_percpu_data(cpu, rsp, - strcmp(rsp->name, "rcu_preempt") == 0); + rcu_init_percpu_data(cpu, rsp); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7b572c5c65e1..13766ad2f4ee 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -252,7 +252,6 @@ struct rcu_data { bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ - bool preemptible; /* Preemptible RCU? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ #ifdef CONFIG_RCU_CPU_STALL_INFO -- cgit v1.2.3 From 4fc5b75537d4f56577ad00355b4cd09627deb3c3 Mon Sep 17 00:00:00 2001 From: Iulia Manda Date: Tue, 11 Mar 2014 15:22:28 +0200 Subject: rcu: Protect uses of jiffies_stall field with ACCESS_ONCE() Some of the uses of the rcu_state structure's ->jiffies_stall field do not use ACCESS_ONCE(), despite there being unprotected accesses. This commit therefore uses the ACCESS_ONCE() macro to protect this field. Signed-off-by: Iulia Manda Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2cc39c781085..c624415f8386 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -865,7 +865,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->gp_start = j; smp_wmb(); /* Record start time before stall time. */ j1 = rcu_jiffies_till_stall_check(); - rsp->jiffies_stall = j + j1; + ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; } @@ -904,12 +904,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) /* Only let one CPU complain about others per time interval. 
*/ raw_spin_lock_irqsave(&rnp->lock, flags); - delta = jiffies - rsp->jiffies_stall; + delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -992,8 +992,8 @@ static void print_cpu_stall(struct rcu_state *rsp) dump_stack(); raw_spin_lock_irqsave(&rnp->lock, flags); - if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) - rsp->jiffies_stall = jiffies + + if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1077,7 +1077,7 @@ void rcu_cpu_stall_reset(void) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - rsp->jiffies_stall = jiffies + ULONG_MAX / 2; + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2; } /* -- cgit v1.2.3 From 48a7639ce80cf279834d0d44865e49ecd714f37d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Mar 2014 13:02:16 -0700 Subject: rcu: Make callers awaken grace-period kthread The rcu_start_gp_advanced() function currently uses irq_work_queue() to defer wakeups of the RCU grace-period kthread. This deferring is necessary to avoid RCU-scheduler deadlocks involving the rcu_node structure's lock, meaning that RCU cannot call any of the scheduler's wake-up functions while holding one of these locks. Unfortunately, the second and subsequent calls to irq_work_queue() are ignored, and the first call will be ignored (aside from queuing the work item) if the scheduler-clock tick is turned off. This is OK for many uses, especially those where irq_work_queue() is called from an interrupt or softirq handler, because in those cases the scheduler-clock-tick state will be re-evaluated, which will turn the scheduler-clock tick back on. On the next tick, any deferred work will then be processed. However, this strategy does not always work for RCU, which can be invoked at process level from idle CPUs. In this case, the tick might never be turned back on, indefinitely defering a grace-period start request. Note that the RCU CPU stall detector cannot see this condition, because there is no RCU grace period in progress. Therefore, we can (and do!) see long tens-of-seconds stalls in grace-period handling. In theory, we could see a full grace-period hang, but rcutorture testing to date has seen only the tens-of-seconds stalls. Event tracing demonstrates that irq_work_queue() is being called repeatedly to no effect during these stalls: The "newreq" event appears repeatedly from a task that is not one of the grace-period kthreads. In theory, irq_work_queue() might be fixed to avoid this sort of issue, but RCU's requirements are unusual and it is quite straightforward to pass wake-up responsibility up through RCU's call chain, so that the wakeup happens when the offending locks are released. This commit therefore makes this change. The rcu_start_gp_advanced(), rcu_start_future_gp(), rcu_accelerate_cbs(), rcu_advance_cbs(), __note_gp_changes(), and rcu_start_gp() functions now return a boolean which indicates when a wake-up is needed. 
A new rcu_gp_kthread_wake() does the wakeup when it is necessary and safe to do so: No self-wakes, no wake-ups if the ->gp_flags field indicates there is no need (as in someone else did the wake-up before we got around to it), and no wake-ups before the grace-period kthread has been created. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Frederic Weisbecker Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 137 +++++++++++++++++++++++++++++------------------ kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 10 +++- 3 files changed, 94 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c624415f8386..fca911b6b29c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); -static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp, bool *isidle, @@ -1138,15 +1138,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Start some future grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. + * rcu_node structure's ->need_future_gp field. Returns true if there + * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock. */ -static unsigned long __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +static bool __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long *c_out) { unsigned long c; int i; + bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); /* @@ -1157,7 +1160,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); if (rnp->need_future_gp[c & 0x1]) { trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); - return c; + goto out; } /* @@ -1171,7 +1174,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); - return c; + goto out; } /* @@ -1212,12 +1215,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); } unlock_out: if (rnp != rnp_root) raw_spin_unlock(&rnp_root->lock); - return c; +out: + if (c_out != NULL) + *c_out = c; + return ret; } /* @@ -1240,6 +1246,22 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) return needmore; } +/* + * Awaken the grace-period kthread for the specified flavor of RCU. + * Don't do a self-awaken, and don't bother awakening when there is + * nothing for the grace-period kthread to do (as in several CPUs + * raced to awaken, and we lost), and finally don't try to awaken + * a kthread that has not yet been created. 
+ */ +static void rcu_gp_kthread_wake(struct rcu_state *rsp) +{ + if (current == rsp->gp_kthread || + !ACCESS_ONCE(rsp->gp_flags) || + !rsp->gp_kthread) + return; + wake_up(&rsp->gp_wq); +} + /* * If there is room, assign a ->completed number to any callbacks on * this CPU that have not already been assigned. Also accelerate any @@ -1247,19 +1269,21 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) * since proven to be too conservative, which can happen if callbacks get * assigned a ->completed number while RCU is idle, but with reference to * a non-root rcu_node structure. This function is idempotent, so it does - * not hurt to call it repeatedly. + * not hurt to call it repeatedly. Returns an flag saying that we should + * awaken the RCU grace-period kthread. * * The caller must hold rnp->lock with interrupts disabled. */ -static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { unsigned long c; int i; + bool ret; /* If the CPU has no callbacks, nothing to do. */ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return; + return false; /* * Starting from the sublist containing the callbacks most @@ -1288,7 +1312,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * be grouped into. */ if (++i >= RCU_NEXT_TAIL) - return; + return false; /* * Assign all subsequent callbacks' ->completed number to the next @@ -1300,13 +1324,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rdp->nxtcompleted[i] = c; } /* Record any needed additional grace periods. */ - rcu_start_future_gp(rnp, rdp); + ret = rcu_start_future_gp(rnp, rdp, NULL); /* Trace depending on how much we were able to accelerate. */ if (!*rdp->nxttail[RCU_WAIT_TAIL]) trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); + return ret; } /* @@ -1315,17 +1340,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL * sublist. This function is idempotent, so it does not hurt to * invoke it repeatedly. As long as it is not invoked -too- often... + * Returns true if the RCU grace-period kthread needs to be awakened. * * The caller must hold rnp->lock with interrupts disabled. */ -static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { int i, j; /* If the CPU has no callbacks, nothing to do. */ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return; + return false; /* * Find all callbacks whose ->completed numbers indicate that they @@ -1349,26 +1375,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, } /* Classify any remaining callbacks. */ - rcu_accelerate_cbs(rsp, rnp, rdp); + return rcu_accelerate_cbs(rsp, rnp, rdp); } /* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. + * Returns true if the grace-period kthread needs to be awakened. 
*/ -static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) { + bool ret; + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed) { /* No grace period end, so just accelerate recent callbacks. */ - rcu_accelerate_cbs(rsp, rnp, rdp); + ret = rcu_accelerate_cbs(rsp, rnp, rdp); } else { /* Advance callbacks. */ - rcu_advance_cbs(rsp, rnp, rdp); + ret = rcu_advance_cbs(rsp, rnp, rdp); /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; @@ -1387,11 +1417,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); } + return ret; } static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; + bool needwake; struct rcu_node *rnp; local_irq_save(flags); @@ -1403,8 +1435,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) return; } smp_mb__after_unlock_lock(); - __note_gp_changes(rsp, rnp, rdp); + needwake = __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); } /* @@ -1468,7 +1502,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WARN_ON_ONCE(rnp->completed != rsp->completed); ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) - __note_gp_changes(rsp, rnp, rdp); + (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, @@ -1528,6 +1562,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; + bool needgp = false; int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1563,7 +1598,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) - __note_gp_changes(rsp, rnp, rdp); + needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); @@ -1579,8 +1614,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); - rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ - if (cpu_needs_another_gp(rsp, rdp)) { + /* Advance CBs to reduce false positives below. */ + needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; + if (needgp || cpu_needs_another_gp(rsp, rdp)) { ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1680,16 +1716,6 @@ static int __noreturn rcu_gp_kthread(void *arg) } } -static void rsp_wakeup(struct irq_work *work) -{ - struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); - - /* Wake up rcu_gp_kthread() to start the grace period. */ - wake_up(&rsp->gp_wq); - trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), - "Workqueuewoken"); -} - /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. 
The caller must hold @@ -1698,8 +1724,10 @@ static void rsp_wakeup(struct irq_work *work) * Note that it is legal for a dying CPU (which is marked as offline) to * invoke this function. This can happen when the dying CPU reports its * quiescent state. + * + * Returns true if the grace-period kthread must be awakened. */ -static void +static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { @@ -1710,7 +1738,7 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, * or a grace period is already in progress. * Either way, don't start a new grace period. */ - return; + return false; } ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1719,14 +1747,9 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, /* * We can't do wakeups while holding the rnp->lock, as that * could cause possible deadlocks with the rq->lock. Defer - * the wakeup to interrupt context. And don't bother waking - * up the running kthread. + * the wakeup to our caller. */ - if (current != rsp->gp_kthread) { - trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), - "Workqueuewake"); - irq_work_queue(&rsp->wakeup_work); - } + return true; } /* @@ -1735,12 +1758,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, * is invoked indirectly from rcu_advance_cbs(), which would result in * endless recursion -- or would do so if it wasn't for the self-deadlock * that is encountered beforehand. + * + * Returns true if the grace-period kthread needs to be awakened. */ -static void -rcu_start_gp(struct rcu_state *rsp) +static bool rcu_start_gp(struct rcu_state *rsp) { struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); + bool ret = false; /* * If there is no grace period in progress right now, any @@ -1750,8 +1775,9 @@ rcu_start_gp(struct rcu_state *rsp) * resulting in pointless grace periods. So, advance callbacks * then start the grace period! */ - rcu_advance_cbs(rsp, rnp, rdp); - rcu_start_gp_advanced(rsp, rnp, rdp); + ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; + ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; + return ret; } /* @@ -1840,6 +1866,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; + bool needwake; struct rcu_node *rnp; rnp = rdp->mynode; @@ -1868,9 +1895,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ + if (needwake) + rcu_gp_kthread_wake(rsp); } } @@ -2354,6 +2383,7 @@ static void __rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; + bool needwake; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -2365,8 +2395,10 @@ __rcu_process_callbacks(struct rcu_state *rsp) local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. 
*/ - rcu_start_gp(rsp); + needwake = rcu_start_gp(rsp); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); } else { local_irq_restore(flags); } @@ -2424,6 +2456,8 @@ static void invoke_rcu_core(void) static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_head *head, unsigned long flags) { + bool needwake; + /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2453,8 +2487,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, raw_spin_lock(&rnp_root->lock); smp_mb__after_unlock_lock(); - rcu_start_gp(rsp); + needwake = rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); + if (needwake) + rcu_gp_kthread_wake(rsp); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; @@ -3440,7 +3476,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); - init_irq_work(&rsp->wakeup_work, rsp_wakeup); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 13766ad2f4ee..cdbb392d5b4b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -461,7 +461,6 @@ struct rcu_state { const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ - struct irq_work wakeup_work; /* Postponed wakeups */ }; /* Values for rcu_state structure's gp_flags field. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f60dd6ea8333..0cb0816036c5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1744,6 +1744,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) static void rcu_prepare_for_idle(int cpu) { #ifndef CONFIG_RCU_NOCB_CPU_ALL + bool needwake; struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); struct rcu_node *rnp; @@ -1792,8 +1793,10 @@ static void rcu_prepare_for_idle(int cpu) rnp = rdp->mynode; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); - rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + if (needwake) + rcu_gp_kthread_wake(rsp); } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } @@ -2230,12 +2233,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) unsigned long c; bool d; unsigned long flags; + bool needwake; struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - c = rcu_start_future_gp(rnp, rdp); + needwake = rcu_start_future_gp(rnp, rdp, &c); raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (needwake) + rcu_gp_kthread_wake(rdp->rsp); /* * Wait for the grace period. Do so interruptibly to avoid messing -- cgit v1.2.3 From af952b919bf9e2cf3c4e839359cfd033d98aa011 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Wed, 12 Mar 2014 21:46:46 +0530 Subject: rcu: Protect uses of ->jiffies_stall with ACCESS_ONCE() Some of the accesses to the rcu_state structure's ->jiffies_stall field are unprotected. This patch protects them with ACCESS_ONCE(). The following coccinelle script was used to acheive this: /* coccinelle script to protect uses of ->jiffies_stall with ACCESS_ONCE() */ @@ identifier a; @@ ( ACCESS_ONCE(a->jiffies_stall) | - a->jiffies_stall + ACCESS_ONCE(a->jiffies_stall) ) Signed-off-by: Himangi Saraogi Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tiny_plugin.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 431528520562..858c56569127 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) return; rcp->ticks_this_gp++; j = jiffies; - js = rcp->jiffies_stall; + js = ACCESS_ONCE(rcp->jiffies_stall); if (*rcp->curtail && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, @@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) dump_stack(); } if (*rcp->curtail && ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; else if (ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); } static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) { rcp->ticks_this_gp = 0; rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); } static void check_cpu_stalls(void) -- cgit v1.2.3 From 7941dbdebe2a1fda31aefa033794510f95720a5a Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Mon, 17 Mar 2014 18:33:28 +0200 Subject: rcu: Add event tracing to dyntick_save_progress_counter(). This patch adds event tracing to dyntick_save_progress_counter() in the case where it returns 1. I used the tracepoint string "dti" because this function returns 1 in case the CPU is in dynticks idle mode. Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fca911b6b29c..c6fd0f15425c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -772,7 +772,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); - return (rdp->dynticks_snap & 0x1) == 0; + if ((rdp->dynticks_snap & 0x1) == 0) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + return 1; + } else { + return 0; + } } /* -- cgit v1.2.3 From 595f3900f6b4221403493c530138b8dad2bd87f3 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Tue, 18 Mar 2014 22:52:26 +0530 Subject: rcu: Replace NR_CPUS with nr_cpu_ids This patch replaces NR_CPUS with nr_cpu_ids as NR_CPUS should consider cpumask_var_t. Signed-off-by: Himangi Saraogi Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c6fd0f15425c..6724bd98da7d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3461,8 +3461,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; rnp->grphi = (j + 1) * cpustride - 1; - if (rnp->grphi >= NR_CPUS) - rnp->grphi = NR_CPUS - 1; + if (rnp->grphi >= nr_cpu_ids) + rnp->grphi = nr_cpu_ids - 1; if (i == 0) { rnp->grpnum = 0; rnp->grpmask = 0; -- cgit v1.2.3 From 495aa969dbaef2e3d28094a2b3c752d069932748 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Tue, 18 Mar 2014 20:48:48 +0200 Subject: rcu: Consolidate kfree_call_rcu() to use rcu_state pointer kfree_call_rcu is defined two times. When defined under CONFIG_TREE_PREEMPT_RCU, it uses rcu_preempt_state. Otherwise, it uses rcu_sched_state. This patch uses the rcu_state_pointer to combine the two definitions into one. The resulting function is placed after the closing of the preprocessor conditional CONFIG_TREE_PREEMPT_RCU. Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 14 ++++++++++++++ kernel/rcu/tree_plugin.h | 30 ------------------------------ 2 files changed, 14 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6724bd98da7d..f3317c18b354 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2597,6 +2597,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); +/* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, rcu_state, -1, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + /* * Because a context switch is a grace period for RCU-sched and RCU-bh, * any blocking grace-period wait automatically implies a grace period diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0cb0816036c5..4918a1243efb 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -688,20 +688,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); -/* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). - */ -void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_preempt_state, -1, 1); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu); - /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -1079,22 +1065,6 @@ static void rcu_preempt_check_callbacks(int cpu) { } -/* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. 
Until then, this - * function may only be called from __kfree_rcu(). - * - * Because there is no preemptible RCU, we use RCU-sched instead. - */ -void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_sched_state, -1, 1); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu); - /* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptible RCU does not exist, map to rcu-sched. -- cgit v1.2.3 From a381d757d93f6e43063f74888676edd6216d0aff Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Wed, 19 Mar 2014 11:18:31 +0200 Subject: rcu: Merge rcu_sched_force_quiescent_state() with rcu_force_quiescent_state() This patch merges the function rcu_force_quiescent_state() with rcu_sched_force_quiescent_state(), using the rcu_state pointer. Firstly, the rcu_sched_force_quiescent_state() function is deleted from the file kernel/rcu/tree.c. Also, the rcu_force_quiescent_state() function that was calling force_quiescent_state with the argument rcu_preempt_state pointer was deleted as well. The new function that combines the old ones uses the rcu_state pointer and is located after rcu_batches_completed_bh() in kernel/rcu/tree.c. Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 9 +++++++++ kernel/rcu/tree_plugin.h | 19 ------------------- 2 files changed, 9 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f3317c18b354..dbf43f5c86fa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -270,6 +270,15 @@ long rcu_batches_completed_bh(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +/* + * Force a quiescent state. + */ +void rcu_force_quiescent_state(void) +{ + force_quiescent_state(rcu_state); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + /* * Force a quiescent state for RCU BH. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4918a1243efb..1af19e006899 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -148,15 +148,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -/* - * Force a quiescent state for preemptible RCU. - */ -void rcu_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_preempt_state); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -976,16 +967,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -/* - * Force a quiescent state for RCU, which, because there is no preemptible - * RCU, becomes the same as rcu-sched. - */ -void rcu_force_quiescent_state(void) -{ - rcu_sched_force_quiescent_state(); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - /* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. -- cgit v1.2.3 From 5057f55e543b7859cfd26bc281291795eac93f8a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Apr 2014 11:20:36 -0700 Subject: rcu: Bind RCU grace-period kthreads if NO_HZ_FULL Currently, RCU binds the grace-period kthreads to the timekeeping CPU only if CONFIG_NO_HZ_FULL_SYSIDLE=y. This means that these kthreads must be bound manually when CONFIG_NO_HZ_FULL_SYSIDLE=n and CONFIG_NO_HZ_FULL=y: Otherwise, these kthreads will induce OS jitter on random CPUs. 
Given that we are trying to reduce the amount of manual tweaking required to make CONFIG_NO_HZ_FULL=y work nicely, this commit makes this binding happen when CONFIG_NO_HZ_FULL=y, even in cases where CONFIG_NO_HZ_FULL_SYSIDLE=n. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1af19e006899..045c6d04ed3f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2595,20 +2595,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) return rsp == rcu_sysidle_state; } -/* - * Bind the grace-period kthread for the sysidle flavor of RCU to the - * timekeeping CPU. - */ -static void rcu_bind_gp_kthread(void) -{ - int cpu = ACCESS_ONCE(tick_do_timer_cpu); - - if (cpu < 0 || cpu >= nr_cpu_ids) - return; - if (raw_smp_processor_id() != cpu) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - /* * Return a delay in jiffies based on the number of CPUs, rcu_node * leaf fanout, and jiffies tick rate. The idea is to allow larger @@ -2819,10 +2805,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) return false; } -static void rcu_bind_gp_kthread(void) -{ -} - static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj) { @@ -2853,3 +2835,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) #endif /* #ifdef CONFIG_NO_HZ_FULL */ return 0; } + +/* + * Bind the grace-period kthread for the sysidle flavor of RCU to the + * timekeeping CPU. + */ +static void rcu_bind_gp_kthread(void) +{ +#ifdef CONFIG_NO_HZ_FULL + int cpu = ACCESS_ONCE(tick_do_timer_cpu); + + if (cpu < 0 || cpu >= nr_cpu_ids) + return; + if (raw_smp_processor_id() != cpu) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ +} -- cgit v1.2.3 From becb41bfe0544f1f7f494f48d6f68cbdb2e1ed0e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Apr 2014 13:34:07 -0700 Subject: rcu: Make large and small sysidle systems use same state machine Currently, small systems move back into RCU_SYSIDLE_NOT from RCU_SYSIDLE_SHORT and large systems do not. This works because moving aggressively to RCU_SYSIDLE_NOT affects only performance, not correctness, and on small systems, the performance impact should be negligible. That said, this difference does make RCU a bit more complex, and RCU does not seem to be suffering from any lack of complexity. This commit therefore adjusts small-system operation to match that of large systems, so that the state never moves back to RCU_SYSIDLE_NOT from RCU_SYSIDLE_SHORT. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 045c6d04ed3f..f7593c2536b0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2659,7 +2659,8 @@ static void rcu_sysidle(unsigned long j) static void rcu_sysidle_cancel(void) { smp_mb(); - ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; + if (full_sysidle_state > RCU_SYSIDLE_SHORT) + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; } /* -- cgit v1.2.3 From 8c96ae1dfa1608b92ddbf8bb285d7269832e4ac0 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Mon, 14 Apr 2014 10:25:43 -0700 Subject: rcu: Remove duplicate resched_cpu() declaration Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dbf43f5c86fa..ecd7e046de76 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -975,12 +975,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) force_quiescent_state(rsp); /* Kick them all. */ } -/* - * This function really isn't for public consumption, but RCU is special in - * that context switches can allow the state machine to make progress. - */ -extern void resched_cpu(int cpu); - static void print_cpu_stall(struct rcu_state *rsp) { int cpu; -- cgit v1.2.3 From fa07a58f71ee23a82597ce337126982d0cc60b72 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 15 Apr 2014 12:20:12 -0700 Subject: rcu: Replace __this_cpu_ptr() uses with raw_cpu_ptr() __this_cpu_ptr is being phased out. One special case is increment_cpu_stall_ticks(). A per cpu variable is incremented so use raw_cpu_inc(). Cc: Dipankar Sarma Signed-off-by: Christoph Lameter Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ecd7e046de76..d3e023e24c6f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2008,7 +2008,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { int i; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. 
*/ if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) @@ -2392,7 +2392,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; bool needwake; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -3066,7 +3066,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) static void rcu_barrier_func(void *type) { struct rcu_state *rsp = type; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); atomic_inc(&rsp->barrier_cpu_count); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f7593c2536b0..2e579c38bd91 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1809,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused) struct rcu_data *rdp; for_each_rcu_flavor(rsp) { - rdp = __this_cpu_ptr(rsp->rda); + rdp = raw_cpu_ptr(rsp->rda); if (rdp->qlen_lazy != 0) { atomic_inc(&oom_callback_count); rsp->call(&rdp->oom_head, rcu_oom_callback); @@ -1951,7 +1951,7 @@ static void increment_cpu_stall_ticks(void) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - __this_cpu_ptr(rsp->rda)->ticks_this_gp++; + raw_cpu_inc(rsp->rda->ticks_this_gp); } #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ -- cgit v1.2.3 From a5d6d3a1b00a0ad88f07c3a727c79b27915278e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Apr 2014 09:06:24 -0700 Subject: softirq: A single rcu_bh_qs() call per softirq set is enough Calling rcu_bh_qs() after every softirq action is not really needed. What RCU needs is at least one rcu_bh_qs() per softirq round to note a quiescent state was passed for rcu_bh. Note for Paul and myself : this could be inlined as a single instruction and avoid smp_processor_id() (sone this_cpu_write(rcu_bh_data.passed_quiesce, 1)) Signed-off-by: Eric Dumazet Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/softirq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b50990a5bea0..b9b2d4906848 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -232,7 +232,6 @@ asmlinkage void __do_softirq(void) bool in_hardirq; __u32 pending; int softirq_bit; - int cpu; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -247,7 +246,6 @@ asmlinkage void __do_softirq(void) __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); in_hardirq = lockdep_softirq_start(); - cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); @@ -276,11 +274,11 @@ restart: prev_count, preempt_count()); preempt_count_set(prev_count); } - rcu_bh_qs(cpu); h++; pending >>= softirq_bit; } + rcu_bh_qs(smp_processor_id()); local_irq_disable(); pending = local_softirq_pending(); -- cgit v1.2.3 From 52c324f8a87b336496d0f5e9d8dff1aa32bb08cd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 1 May 2014 00:13:47 +0200 Subject: cpuidle: Combine cpuidle_enabled() with cpuidle_select() Since both cpuidle_enabled() and cpuidle_select() are only called by cpuidle_idle_call(), it is not really useful to keep them separate and combining them will help to avoid complicating cpuidle_idle_call() even further if governors are changed to return error codes sometimes. This code modification shouldn't lead to any functional changes. Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/cpuidle.c | 26 ++++++-------------------- include/linux/cpuidle.h | 5 ----- kernel/sched/idle.c | 20 +++++++------------- 3 files changed, 13 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 8236746e46bb..f38359f64cc6 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -64,26 +64,6 @@ int cpuidle_play_dead(void) return -ENODEV; } -/** - * cpuidle_enabled - check if the cpuidle framework is ready - * @dev: cpuidle device for this cpu - * @drv: cpuidle driver for this cpu - * - * Return 0 on success, otherwise: - * -NODEV : the cpuidle framework is not available - * -EBUSY : the cpuidle framework is not initialized - */ -int cpuidle_enabled(struct cpuidle_driver *drv, struct cpuidle_device *dev) -{ - if (off || !initialized) - return -ENODEV; - - if (!drv || !dev || !dev->enabled) - return -EBUSY; - - return 0; -} - /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -138,6 +118,12 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, */ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { + if (off || !initialized) + return -ENODEV; + + if (!drv || !dev || !dev->enabled) + return -EBUSY; + return cpuidle_curr_governor->select(drv, dev); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index b0238cba440b..a8d5bd391a26 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -120,8 +120,6 @@ struct cpuidle_driver { #ifdef CONFIG_CPU_IDLE extern void disable_cpuidle(void); -extern int cpuidle_enabled(struct cpuidle_driver *drv, - struct cpuidle_device *dev); extern int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter(struct cpuidle_driver *drv, @@ -149,9 +147,6 @@ extern int cpuidle_play_dead(void); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else static inline void disable_cpuidle(void) { } -static inline int cpuidle_enabled(struct cpuidle_driver *drv, - struct cpuidle_device *dev) -{return -ENODEV; } static inline int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a079c7..a8f12247ce7c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -101,19 +101,13 @@ static int cpuidle_idle_call(void) rcu_idle_enter(); /* - * Check if the cpuidle framework is ready, otherwise fallback - * to the default arch specific idle method + * Ask the cpuidle framework to choose a convenient idle state. + * Fall back to the default arch specific idle method on errors. */ - ret = cpuidle_enabled(drv, dev); - - if (!ret) { - /* - * Ask the governor to choose an idle state it thinks - * it is convenient to go to. 
There is *always* a - * convenient idle state - */ - next_state = cpuidle_select(drv, dev); + next_state = cpuidle_select(drv, dev); + ret = next_state; + if (ret >= 0) { /* * The idle task must be scheduled, it is pointless to * go to idle, just update no idle residency and get @@ -140,7 +134,7 @@ static int cpuidle_idle_call(void) CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu); - if (!ret) { + if (ret >= 0) { trace_cpu_idle_rcuidle(next_state, dev->cpu); /* @@ -175,7 +169,7 @@ static int cpuidle_idle_call(void) * We can't use the cpuidle framework, let's use the default * idle routine */ - if (ret) + if (ret < 0) arch_cpu_idle(); __current_set_polling(); -- cgit v1.2.3 From 2c730785d9532d2a9c46e059bd6a6c9a764c539f Mon Sep 17 00:00:00 2001 From: Sebastian Capella Date: Mon, 21 Apr 2014 17:30:46 -0700 Subject: PM / hibernate: no kernel_power_off when pm_power_off NULL Reboot logic in kernel/reboot will avoid calling kernel_power_off when pm_power_off is null, and instead uses kernel_halt. Change hibernate's power_down to follow the behavior in the reboot call. Calling the notifier twice (once for SYS_POWER_OFF and again for SYS_HALT) causes a panic during hibernation on Kirkwood Openblocks A6 board. Signed-off-by: Sebastian Capella Reported-by: Ezequiel Garcia Reviewed-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index de4b989cc8fd..1f08ac7f55d8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -599,7 +599,8 @@ static void power_down(void) case HIBERNATION_PLATFORM: hibernation_platform_enter(); case HIBERNATION_SHUTDOWN: - kernel_power_off(); + if (pm_power_off) + kernel_power_off(); break; #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: @@ -627,7 +628,8 @@ static void power_down(void) * corruption after resume. */ printk(KERN_CRIT "PM: Please power down manually\n"); - while(1); + while (1) + cpu_relax(); } /** -- cgit v1.2.3 From 1e77d0a1ed7417d2a5a52a7b8d32aea1833faa6c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 7 Mar 2013 14:53:45 +0100 Subject: genirq: Sanitize spurious interrupt detection of threaded irqs Till reported that the spurious interrupt detection of threaded interrupts is broken in two ways: - note_interrupt() is called for each action thread of a shared interrupt line. That's wrong as we are only interested whether none of the device drivers felt responsible for the interrupt, but by calling multiple times for a single interrupt line we account IRQ_NONE even if one of the drivers felt responsible. - note_interrupt() when called from the thread handler is not serialized. That leaves the members of irq_desc which are used for the spurious detection unprotected. To solve this we need to defer the spurious detection of a threaded interrupt to the next hardware interrupt context where we have implicit serialization. If note_interrupt is called with action_ret == IRQ_WAKE_THREAD, we check whether the previous interrupt requested a deferred check. If not, we request a deferred check for the next hardware interrupt and return. If set, we check whether one of the interrupt threads signaled success. Depending on this information we feed the result into the spurious detector. If one primary handler of a shared interrupt returns IRQ_HANDLED we disable the deferred check of irq threads on the same line, as we have found at least one device driver who cared. 
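A stand-alone sketch of the deferral scheme described above may help before reading the diff. It models only the case where the primary handlers returned exactly IRQ_WAKE_THREAD; the names (toy_desc, toy_thread_handled, toy_note_wake_thread) are invented for illustration, and the authoritative implementation is the note_interrupt() change below.

/*
 * Toy model (plain C11, not kernel code) of the deferred spurious check.
 * threads_handled is bumped by thread handlers that returned IRQ_HANDLED;
 * threads_handled_last carries the last snapshot plus the "deferred" flag.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define SPURIOUS_DEFERRED 0x80000000u

struct toy_desc {
	atomic_uint threads_handled;
	unsigned int threads_handled_last;
};

/* Threaded-handler path: note that this thread felt responsible. */
static void toy_thread_handled(struct toy_desc *desc)
{
	atomic_fetch_add(&desc->threads_handled, 1);
}

/*
 * Hard-interrupt path, called when the primary handler only woke the
 * thread. Returns false when the interrupt should be fed to the
 * spurious detector as unhandled; true means "handled" or "decision
 * deferred to the next hard interrupt".
 */
static bool toy_note_wake_thread(struct toy_desc *desc)
{
	unsigned int handled;

	if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
		/* First time: arm the deferred check, decide next time. */
		desc->threads_handled_last |= SPURIOUS_DEFERRED;
		return true;
	}

	handled = atomic_load(&desc->threads_handled) | SPURIOUS_DEFERRED;
	if (handled != desc->threads_handled_last) {
		/* A thread reported success since the last hard interrupt. */
		desc->threads_handled_last = handled;
		return true;
	}
	/* No thread felt responsible since the last interrupt. */
	return false;
}

int main(void)
{
	struct toy_desc desc = { 0 };

	printf("wake 1: %d\n", toy_note_wake_thread(&desc)); /* 1: deferred */
	toy_thread_handled(&desc);                           /* thread ran  */
	printf("wake 2: %d\n", toy_note_wake_thread(&desc)); /* 1: handled  */
	printf("wake 3: %d\n", toy_note_wake_thread(&desc)); /* 0: spurious */
	return 0;
}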
Reported-by: Till Straumann Signed-off-by: Thomas Gleixner Tested-by: Austin Schuh Cc: Oliver Hartkopp Cc: Wolfgang Grandegger Cc: Pavel Pisa Cc: Marc Kleine-Budde Cc: linux-can@vger.kernel.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1303071450130.22263@ionos --- include/linux/irqdesc.h | 4 ++ kernel/irq/manage.c | 4 +- kernel/irq/spurious.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 108 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 26e2661d3935..472c021a2d4f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -27,6 +27,8 @@ struct irq_desc; * @irq_count: stats field to detect stalled irqs * @last_unhandled: aging timer for unhandled count * @irqs_unhandled: stats field for spurious unhandled interrupts + * @threads_handled: stats field for deferred spurious detection of threaded handlers + * @threads_handled_last: comparator field for deferred spurious detection of theraded handlers * @lock: locking for SMP * @affinity_hint: hint to user space for preferred irq affinity * @affinity_notify: context for notification of affinity changes @@ -52,6 +54,8 @@ struct irq_desc { unsigned int irq_count; /* For detecting broken IRQs */ unsigned long last_unhandled; /* Aging timer for unhandled count */ unsigned int irqs_unhandled; + atomic_t threads_handled; + int threads_handled_last; raw_spinlock_t lock; struct cpumask *percpu_enabled; #ifdef CONFIG_SMP diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d34131ca372b..3dc6a61bf06a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -886,8 +886,8 @@ static int irq_thread(void *data) irq_thread_check_affinity(desc, action); action_ret = handler_fn(desc, action); - if (!noirqdebug) - note_interrupt(action->irq, desc, action_ret); + if (action_ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); wake_threads_waitq(desc); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a1d8cc63b56e..e2514b0e439e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, return action && (action->flags & IRQF_IRQPOLL); } +#define SPURIOUS_DEFERRED 0x80000000 + void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { @@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, irq_settings_is_polled(desc)) return; - /* we get here again via the threaded handler */ - if (action_ret == IRQ_WAKE_THREAD) - return; - if (bad_action_ret(action_ret)) { report_bad_irq(irq, desc, action_ret); return; } + /* + * We cannot call note_interrupt from the threaded handler + * because we need to look at the compound of all handlers + * (primary and threaded). Aside of that in the threaded + * shared case we have no serialization against an incoming + * hardware interrupt while we are dealing with a threaded + * result. + * + * So in case a thread is woken, we just note the fact and + * defer the analysis to the next hardware interrupt. + * + * The threaded handlers store whether they sucessfully + * handled an interrupt and we check whether that number + * changed versus the last invocation. + * + * We could handle all interrupts with the delayed by one + * mechanism, but for the non forced threaded case we'd just + * add pointless overhead to the straight hardirq interrupts + * for the sake of a few lines less code. 
+ */ + if (action_ret & IRQ_WAKE_THREAD) { + /* + * There is a thread woken. Check whether one of the + * shared primary handlers returned IRQ_HANDLED. If + * not we defer the spurious detection to the next + * interrupt. + */ + if (action_ret == IRQ_WAKE_THREAD) { + int handled; + /* + * We use bit 31 of thread_handled_last to + * denote the deferred spurious detection + * active. No locking necessary as + * thread_handled_last is only accessed here + * and we have the guarantee that hard + * interrupts are not reentrant. + */ + if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) { + desc->threads_handled_last |= SPURIOUS_DEFERRED; + return; + } + /* + * Check whether one of the threaded handlers + * returned IRQ_HANDLED since the last + * interrupt happened. + * + * For simplicity we just set bit 31, as it is + * set in threads_handled_last as well. So we + * avoid extra masking. And we really do not + * care about the high bits of the handled + * count. We just care about the count being + * different than the one we saw before. + */ + handled = atomic_read(&desc->threads_handled); + handled |= SPURIOUS_DEFERRED; + if (handled != desc->threads_handled_last) { + action_ret = IRQ_HANDLED; + /* + * Note: We keep the SPURIOUS_DEFERRED + * bit set. We are handling the + * previous invocation right now. + * Keep it for the current one, so the + * next hardware interrupt will + * account for it. + */ + desc->threads_handled_last = handled; + } else { + /* + * None of the threaded handlers felt + * responsible for the last interrupt + * + * We keep the SPURIOUS_DEFERRED bit + * set in threads_handled_last as we + * need to account for the current + * interrupt as well. + */ + action_ret = IRQ_NONE; + } + } else { + /* + * One of the primary handlers returned + * IRQ_HANDLED. So we don't care about the + * threaded handlers on the same line. Clear + * the deferred detection bit. + * + * In theory we could/should check whether the + * deferred bit is set and take the result of + * the previous run into account here as + * well. But it's really not worth the + * trouble. If every other interrupt is + * handled we never trigger the spurious + * detector. And if this is just the one out + * of 100k unhandled ones which is handled + * then we merily delay the spurious detection + * by one hard interrupt. Not a real problem. + */ + desc->threads_handled_last &= ~SPURIOUS_DEFERRED; + } + } + if (unlikely(action_ret == IRQ_NONE)) { /* * If we are seeing only the odd spurious IRQ caused by -- cgit v1.2.3 From 3cf2f34e1a3d4d5ff209d087925cf950e52f4805 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 2 May 2014 12:53:57 -0700 Subject: rwsem: Add comments to explain the meaning of the rwsem's count field It took me quite a while to understand how rwsem's count field mainifested itself in different scenarios. Add comments to provide a quick reference to the the rwsem's count field for each scenario where readers and writers are contending for the lock. Hopefully it will be useful for future maintenance of the code and for people to get up to speed on how the logic in the code works. Signed-off-by: Tim Chen Cc: Davidlohr Bueso Cc: Alex Shi Cc: Michel Lespinasse Cc: Rik van Riel Cc: Peter Hurley Cc: Paul E.McKenney Cc: Jason Low Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Paul E. 
McKenney Link: http://lkml.kernel.org/r/1399060437.2970.146.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 1d66e08e897d..b4219ff87b8c 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -11,6 +11,55 @@ #include #include +/* + * Guide to the rw_semaphore's count field for common values. + * (32-bit case illustrated, similar for 64-bit) + * + * 0x0000000X (1) X readers active or attempting lock, no writer waiting + * X = #active_readers + #readers attempting to lock + * (X*ACTIVE_BIAS) + * + * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or + * attempting to read lock or write lock. + * + * 0xffff000X (1) X readers active or attempting lock, with waiters for lock + * X = #active readers + # readers attempting lock + * (X*ACTIVE_BIAS + WAITING_BIAS) + * (2) 1 writer attempting lock, no waiters for lock + * X-1 = #active readers + #readers attempting lock + * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * (3) 1 writer active, no waiters for lock + * X-1 = #active readers + #readers attempting lock + * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * + * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock + * (WAITING_BIAS + ACTIVE_BIAS) + * (2) 1 writer active or attempting lock, no waiters for lock + * (ACTIVE_WRITE_BIAS) + * + * 0xffff0000 (1) There are writers or readers queued but none active + * or in the process of attempting lock. + * (WAITING_BIAS) + * Note: writer can attempt to steal lock for this count by adding + * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count + * + * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. + * (ACTIVE_WRITE_BIAS + WAITING_BIAS) + * + * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking + * the count becomes more than 0 for successful lock acquisition, + * i.e. the case where there are only readers or nobody has lock. + * (1st and 2nd case above). + * + * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and + * checking the count becomes ACTIVE_WRITE_BIAS for successful lock + * acquisition (i.e. nobody else has lock or attempts lock). If + * unsuccessful, in rwsem_down_write_failed, we'll check to see if there + * are only waiters but none active (5th case above), and attempt to + * steal the lock. + * + */ + /* * Initialize an rwsem: */ -- cgit v1.2.3 From 3d7ee969bffcc984c8aeaffc6ac6816fd929ace1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:32 -0700 Subject: x86, vdso: Clean up 32-bit vs 64-bit vdso params Rather than using 'vdso_enabled' and an awful #define, just call the parameters vdso32_enabled and vdso64_enabled. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/87913de56bdcbae3d93917938302fc369b05caee.1399317206.git.luto@amacapital.net Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/elf.h | 20 +++++++++++++------- arch/x86/um/vdso/vma.c | 2 +- arch/x86/vdso/vdso32-setup.c | 19 ++++++++----------- arch/x86/vdso/vma.c | 6 +++--- kernel/sysctl.c | 5 +++++ 5 files changed, 30 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 2c71182d30ef..e96df2c0dd69 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -75,7 +75,12 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #include -extern unsigned int vdso_enabled; +#ifdef CONFIG_X86_64 +extern unsigned int vdso64_enabled; +#endif +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +extern unsigned int vdso32_enabled; +#endif /* * This is used to ensure we don't load something for the wrong architecture. @@ -269,9 +274,9 @@ extern int force_personality32; struct task_struct; -#define ARCH_DLINFO_IA32(vdso_enabled) \ +#define ARCH_DLINFO_IA32 \ do { \ - if (vdso_enabled) { \ + if (vdso32_enabled) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ @@ -281,7 +286,7 @@ do { \ #define STACK_RND_MASK (0x7ff) -#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) +#define ARCH_DLINFO ARCH_DLINFO_IA32 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ @@ -292,14 +297,15 @@ do { \ #define ARCH_DLINFO \ do { \ - if (vdso_enabled) \ + if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long)current->mm->context.vdso); \ } while (0) +/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */ #define ARCH_DLINFO_X32 \ do { \ - if (vdso_enabled) \ + if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long)current->mm->context.vdso); \ } while (0) @@ -310,7 +316,7 @@ do { \ if (test_thread_flag(TIF_X32)) \ ARCH_DLINFO_X32; \ else \ - ARCH_DLINFO_IA32(sysctl_vsyscall32) + ARCH_DLINFO_IA32 #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index af91901babb8..916cda4cd5b4 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -12,7 +12,7 @@ #include #include -unsigned int __read_mostly vdso_enabled = 1; +static unsigned int __read_mostly vdso_enabled = 1; unsigned long um_vdso_addr; extern unsigned long task_size; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 00348980a3a6..5a657d93c6e0 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -37,7 +37,6 @@ #endif #ifdef CONFIG_X86_64 -#define vdso_enabled sysctl_vsyscall32 #define arch_setup_additional_pages syscall32_setup_pages #endif @@ -45,13 +44,13 @@ * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ -unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; +unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; -static int __init vdso_setup(char *s) +static int __init vdso32_setup(char *s) { - vdso_enabled = simple_strtoul(s, NULL, 0); + vdso32_enabled = simple_strtoul(s, NULL, 0); - if (vdso_enabled > 1) + if (vdso32_enabled > 1) pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); return 1; @@ -62,12 +61,10 @@ static int __init vdso_setup(char *s) * behavior on both 64-bit and 32-bit kernels. * On 32-bit kernels, vdso=[012] means the same thing. 
*/ -__setup("vdso32=", vdso_setup); +__setup("vdso32=", vdso32_setup); #ifdef CONFIG_X86_32 -__setup_param("vdso=", vdso32_setup, vdso_setup, 0); - -EXPORT_SYMBOL_GPL(vdso_enabled); +__setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif static struct page **vdso32_pages; @@ -160,7 +157,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return x32_setup_additional_pages(bprm, uses_interp); #endif - if (vdso_enabled != 1) /* Other values all mean "disabled" */ + if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; down_write(&mm->mmap_sem); @@ -244,7 +241,7 @@ subsys_initcall(sysenter_setup); static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", - .data = &sysctl_vsyscall32, + .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 1ad102613127..8b790398ed1d 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -17,7 +17,7 @@ #include #if defined(CONFIG_X86_64) -unsigned int __read_mostly vdso_enabled = 1; +unsigned int __read_mostly vdso64_enabled = 1; DECLARE_VDSO_IMAGE(vdso); extern unsigned short vdso_sync_cpuid; @@ -160,7 +160,7 @@ static int setup_additional_pages(struct linux_binprm *bprm, unsigned long addr; int ret; - if (!vdso_enabled) + if (!vdso64_enabled) return 0; down_write(&mm->mmap_sem); @@ -203,7 +203,7 @@ int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) static __init int vdso_setup(char *s) { - vdso_enabled = simple_strtoul(s, NULL, 0); + vdso64_enabled = simple_strtoul(s, NULL, 0); return 0; } __setup("vdso=", vdso_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..420d77afa8fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1418,8 +1418,13 @@ static struct ctl_table vm_table[] = { (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), +#endif .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, -- cgit v1.2.3 From a6220fc19afc07fe77cfd16f5b8e568615517091 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 5 May 2014 00:51:54 +0200 Subject: PM / suspend: Always use deepest C-state in the "freeze" sleep state If freeze_enter() is called, we want to bypass the current cpuidle governor and always use the deepest available (that is, not disabled) C-state, because we want to save as much energy as reasonably possible then and runtime latency constraints don't matter at that point, since the system is in a sleep state anyway. Signed-off-by: Rafael J. Wysocki Tested-by: Aubrey Li --- drivers/cpuidle/cpuidle.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/cpuidle.h | 2 ++ kernel/power/suspend.c | 2 ++ 3 files changed, 48 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index f38359f64cc6..cb7019977c50 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -32,6 +32,7 @@ LIST_HEAD(cpuidle_detected_devices); static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; +static bool use_deepest_state __read_mostly; int cpuidle_disabled(void) { @@ -64,6 +65,45 @@ int cpuidle_play_dead(void) return -ENODEV; } +/** + * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode. 
+ * @enable: Whether enable or disable the feature. + * + * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and + * always use the state with the greatest exit latency (out of the states that + * are not disabled). + * + * This function can only be called after cpuidle_pause() to avoid races. + */ +void cpuidle_use_deepest_state(bool enable) +{ + use_deepest_state = enable; +} + +/** + * cpuidle_find_deepest_state - Find the state of the greatest exit latency. + * @drv: cpuidle driver for a given CPU. + * @dev: cpuidle device for a given CPU. + */ +static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + unsigned int latency_req = 0; + int i, ret = CPUIDLE_DRIVER_STATE_START - 1; + + for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + struct cpuidle_state_usage *su = &dev->states_usage[i]; + + if (s->disabled || su->disable || s->exit_latency <= latency_req) + continue; + + latency_req = s->exit_latency; + ret = i; + } + return ret; +} + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -124,6 +164,9 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (!drv || !dev || !dev->enabled) return -EBUSY; + if (unlikely(use_deepest_state)) + return cpuidle_find_deepest_state(drv, dev); + return cpuidle_curr_governor->select(drv, dev); } @@ -155,7 +198,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ void cpuidle_reflect(struct cpuidle_device *dev, int index) { - if (cpuidle_curr_governor->reflect) + if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state)) cpuidle_curr_governor->reflect(dev, index); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a8d5bd391a26..c51a436135c4 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -143,6 +143,7 @@ extern void cpuidle_resume(void); extern int cpuidle_enable_device(struct cpuidle_device *dev); extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); +extern void cpuidle_use_deepest_state(bool enable); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else @@ -175,6 +176,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } +static inline void cpuidle_use_deepest_state(bool enable) {} static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8233cd4047d7..155721f7f909 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -54,9 +54,11 @@ static void freeze_begin(void) static void freeze_enter(void) { + cpuidle_use_deepest_state(true); cpuidle_resume(); wait_event(suspend_freeze_wait_head, suspend_freeze_wake); cpuidle_pause(); + cpuidle_use_deepest_state(false); } void freeze_wake(void) -- cgit v1.2.3 From 792568ec6a31ca560ca4d528782cbc6cd2cea8b0 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:27 -0400 Subject: sched/numa: Count pages on active node as local The NUMA code is smart enough to distribute the memory of workloads that span multiple NUMA nodes across those NUMA nodes. 
However, it still has a pretty high scan rate for such workloads, because any memory that is left on a node other than the node of the CPU that faulted on the memory is counted as non-local, which causes the scan rate to go up. Counting the memory on any node where the task's numa group is actively running as local, allows the scan rate to slow down once the application is settled in. This should reduce the overhead of the automatic NUMA placement code, when a workload spans multiple NUMA nodes. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5d859ec975c2..f6457b63c95c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1738,6 +1738,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); + int local = !!(flags & TNF_FAULT_LOCAL); int priv; if (!numabalancing_enabled) @@ -1786,6 +1787,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) task_numa_group(p, last_cpupid, flags, &priv); } + /* + * If a workload spans multiple NUMA nodes, a shared fault that + * occurs wholly within the set of nodes that the workload is + * actively using should be counted as local. This allows the + * scan rate to slow down when a workload has settled down. + */ + if (!priv && !local && p->numa_group && + node_isset(cpu_node, p->numa_group->active_nodes) && + node_isset(mem_node, p->numa_group->active_nodes)) + local = 1; + task_numa_placement(p); /* @@ -1800,7 +1812,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; - p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; + p->numa_faults_locality[local] += pages; } static void reset_ptenuma_scan(struct task_struct *p) -- cgit v1.2.3 From 5085e2a328849bdee6650b32d52c87c3788ab01c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:28 -0400 Subject: sched/numa: Retry placement more frequently when misplaced When tasks have not converged on their preferred nodes yet, we want to retry fairly often, to make sure we do not migrate a task's memory to an undesirable location, only to have to move it again later. This patch reduces the interval at which migration is retried, when the task's numa_scan_period is small. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6457b63c95c..ecea8d9f957c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1326,12 +1326,15 @@ static int task_numa_migrate(struct task_struct *p) /* Attempt to migrate a task to a CPU on the preferred node. 
*/ static void numa_migrate_preferred(struct task_struct *p) { + unsigned long interval = HZ; + /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ - p->numa_migrate_retry = jiffies + HZ; + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) -- cgit v1.2.3 From 68d1b02a58f5d9f584c1fb2923ed60ec68cbbd9b Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:29 -0400 Subject: sched/numa: Do not set preferred_node on migration to a second choice node Setting the numa_preferred_node for a task in task_numa_migrate does nothing on a 2-node system. Either we migrate to the node that already was our preferred node, or we stay where we were. On a 4-node system, it can slightly decrease overhead, by not calling the NUMA code as much. Since every node tends to be directly connected to every other node, running on the wrong node for a while does not do much damage. However, on an 8 node system, there are far more bad nodes than there are good ones, and pretending that a second choice is actually the preferred node can greatly delay, or even prevent, a workload from converging. The only time we can safely pretend that a second choice node is the preferred node is when the task is part of a workload that spans multiple NUMA nodes. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ecea8d9f957c..051903f33eec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1301,7 +1301,16 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - sched_setnuma(p, env.dst_nid); + /* + * If the task is part of a workload that spans multiple NUMA nodes, + * and is migrating into one of the workload's active nodes, remember + * this node as the task's preferred numa node, so the workload can + * settle down. + * A task that migrated to a second choice node will be better off + * trying for a better one later. Do not set the preferred node here. + */ + if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); /* * Reset the scan period if the task is being rescheduled on an -- cgit v1.2.3 From 143e1e28cb40bed836b0a06567208bd7347c9672 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:37 +0200 Subject: sched: Rework sched_domain topology definition We replace the old way to configure the scheduler topology with a new method which enables a platform to declare additionnal level (if needed). We still have a default topology table definition that can be used by platform that don't want more level than the SMT, MC, CPU and NUMA ones. This table can be overwritten by an arch which either wants to add new level where a load balance make sense like BOOK or powergating level or wants to change the flags configuration of some levels. 
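To make the new interface concrete, a hypothetical arch-side override might look roughly like the sketch below. arch_book_mask(), arch_topology[] and arch_topology_init() are invented names and the placeholder mask simply reuses the per-node span; the field layout and helpers actually relied on here (cpu_smt_mask(), cpu_smt_flags(), cpu_core_flags(), SD_INIT_NAME(), set_sched_topology()) are the ones added by the hunks further down.

/*
 * Hypothetical arch-side sketch (not part of this patch): declare one
 * extra BOOK level on top of the default SMT/MC/CPU levels and install
 * the table with set_sched_topology().
 */
static const struct cpumask *arch_book_mask(int cpu)
{
	/* Placeholder: a real platform would return its book siblings. */
	return cpumask_of_node(cpu_to_node(cpu));
}

static struct sched_domain_topology_level arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ arch_book_mask, SD_INIT_NAME(BOOK) },	/* extra, arch-specific level */
	{ cpu_cpu_mask, SD_INIT_NAME(CPU) },
	{ NULL, },
};

static int __init arch_topology_init(void)
{
	set_sched_topology(arch_topology);	/* replace the default table */
	return 0;
}
early_initcall(arch_topology_init);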
For each level, we need a function pointer that returns cpumask for each cpu, a function pointer that returns the flags for the level and a name. Only flags that describe topology, can be set by an architecture. The current topology flags are: SD_SHARE_CPUPOWER SD_SHARE_PKG_RESOURCES SD_NUMA SD_ASYM_PACKING Then, each level must be a subset on the next one. The build sequence of the sched_domain will take care of removing useless levels like those with 1 CPU and those with the same CPU span and no more relevant information for load balancing than its children. Signed-off-by: Vincent Guittot Tested-by: Dietmar Eggemann Reviewed-by: Preeti U Murthy Reviewed-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Chris Metcalf Cc: Christoph Lameter Cc: David S. Miller Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Hanjun Guo Cc: Heiko Carstens Cc: Jason Low Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Tony Luck Cc: linux390@de.ibm.com Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/1397209481-28542-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 24 ---- arch/s390/include/asm/topology.h | 2 - arch/tile/include/asm/topology.h | 33 ------ include/linux/sched.h | 53 +++++++++ include/linux/topology.h | 128 +++------------------ kernel/sched/core.c | 233 ++++++++++++++++++++------------------- 6 files changed, 186 insertions(+), 287 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 5cb55a1e606b..3202aa74e0d6 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -46,30 +46,6 @@ void build_cpu_to_node_map(void); -#define SD_CPU_INIT (struct sched_domain) { \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 2, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ - | SD_WAKE_AFFINE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 05425b18c0aa..07763bdb408d 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -64,8 +64,6 @@ static inline void s390_init_cpu_topology(void) }; #endif -#define SD_BOOK_INIT SD_CPU_INIT - #include #endif /* _ASM_S390_TOPOLOGY_H */ diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h index d15c0d8d550f..938311844233 100644 --- a/arch/tile/include/asm/topology.h +++ b/arch/tile/include/asm/topology.h @@ -44,39 +44,6 @@ static inline const struct cpumask *cpumask_of_node(int node) /* For now, use numa node -1 for global allocation. */ #define pcibus_to_node(bus) ((void)(bus), -1) -/* - * TILE architecture has many cores integrated in one processor, so we need - * setup bigger balance_interval for both CPU/NODE scheduling domains to - * reduce process scheduling costs. 
- */ - -/* sched_domains SD_CPU_INIT for TILE architecture */ -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 4, \ - .max_interval = 128, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 0*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - , \ - .last_balance = jiffies, \ - .balance_interval = 32, \ -} - /* By definition, we create nodes based on online memory. */ #define node_has_online_mem(nid) 1 diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a4298fb0d85..656b035c30e5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -879,6 +879,27 @@ enum cpu_idle_type { extern int __weak arch_sd_sibiling_asym_packing(void); +#ifdef CONFIG_SCHED_SMT +static inline const int cpu_smt_flags(void) +{ + return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_SCHED_MC +static inline const int cpu_core_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_NUMA +static inline const int cpu_numa_flags(void) +{ + return SD_NUMA; +} +#endif + struct sched_domain_attr { int relax_domain_level; }; @@ -985,6 +1006,38 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef const int (*sched_domain_flags_f)(void); + +#define SDTL_OVERLAP 0x01 + +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; +}; + +struct sched_domain_topology_level { + sched_domain_mask_f mask; + sched_domain_flags_f sd_flags; + int flags; + int numa_level; + struct sd_data data; +#ifdef CONFIG_SCHED_DEBUG + char *name; +#endif +}; + +extern struct sched_domain_topology_level *sched_domain_topology; + +extern void set_sched_topology(struct sched_domain_topology_level *tl); + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(type) .name = #type +#else +# define SD_INIT_NAME(type) +#endif + #else /* CONFIG_SMP */ struct sched_domain_attr; diff --git a/include/linux/topology.h b/include/linux/topology.h index 7062330a1329..973671ff9e7d 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -66,121 +66,6 @@ int arch_update_cpu_topology(void); #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif -/* - * Below are the 3 major initializers used in building sched_domains: - * SD_SIBLING_INIT, for SMT domains - * SD_CPU_INIT, for SMP domains - * - * Any architecture that cares to do any tuning to these values should do so - * by defining their own arch-specific initializer in include/asm/topology.h. - * A definition there will automagically override these default initializers - * and allow arch-specific performance tuning of sched_domains. - * (Only non-zero and non-null fields need be specified.) - */ - -#ifdef CONFIG_SCHED_SMT -/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, - * so can't we drop this in favor of CONFIG_SCHED_SMT? 
- */ -#define ARCH_HAS_SCHED_WAKE_IDLE -/* Common values for SMT siblings */ -#ifndef SD_SIBLING_INIT -#define SD_SIBLING_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 64, \ - .imbalance_pct = 110, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 1*SD_SHARE_CPUPOWER \ - | 1*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - | 0*SD_PREFER_SIBLING \ - | arch_sd_sibling_asym_packing() \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .smt_gain = 1178, /* 15% */ \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif -#endif /* CONFIG_SCHED_SMT */ - -#ifdef CONFIG_SCHED_MC -/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ -#ifndef SD_MC_INIT -#define SD_MC_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 1*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif -#endif /* CONFIG_SCHED_MC */ - -/* Common values for CPUs */ -#ifndef SD_CPU_INIT -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - | 1*SD_PREFER_SIBLING \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif - -#ifdef CONFIG_SCHED_BOOK -#ifndef SD_BOOK_INIT -#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! 
-#endif -#endif /* CONFIG_SCHED_BOOK */ - #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); @@ -295,4 +180,17 @@ static inline int cpu_to_mem(int cpu) #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif +#ifdef CONFIG_SCHED_SMT +static inline const struct cpumask *cpu_smt_mask(int cpu) +{ + return topology_thread_cpumask(cpu); +} +#endif + +static inline const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} + + #endif /* _LINUX_TOPOLOGY_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13584f1cccfc..7d332b7899cc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5566,17 +5566,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -static const struct cpumask *cpu_cpu_mask(int cpu) -{ - return cpumask_of_node(cpu_to_node(cpu)); -} - -struct sd_data { - struct sched_domain **__percpu sd; - struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; -}; - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -5589,21 +5578,6 @@ enum s_alloc { sa_none, }; -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP 0x01 - -struct sched_domain_topology_level { - sched_domain_init_f init; - sched_domain_mask_f mask; - int flags; - int numa_level; - struct sd_data data; -}; - /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. @@ -5832,34 +5806,6 @@ int __weak arch_sd_sibling_asym_packing(void) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type) sd->name = #type -#else -# define SD_INIT_NAME(sd, type) do { } while (0) -#endif - -#define SD_INIT_FUNC(type) \ -static noinline struct sched_domain * \ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ -{ \ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ - *sd = SD_##type##_INIT; \ - SD_INIT_NAME(sd, type); \ - sd->private = &tl->data; \ - return sd; \ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif - static int default_relax_domain_level = -1; int sched_domain_level_max; @@ -5947,99 +5893,156 @@ static void claim_allocations(int cpu, struct sched_domain *sd) *per_cpu_ptr(sdd->sgp, cpu) = NULL; } -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ - return topology_thread_cpumask(cpu); -} -#endif - -/* - * Topology list, bottom-up. 
- */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC - { sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK - { sd_init_BOOK, cpu_book_mask, }, -#endif - { sd_init_CPU, cpu_cpu_mask, }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->init; tl++) - #ifdef CONFIG_NUMA - static int sched_domains_numa_levels; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; +#endif -static inline int sd_local_flags(int level) -{ - if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) - return 0; - - return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -} +/* + * SD_flags allowed in topology descriptions. + * + * SD_SHARE_CPUPOWER - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * + * Odd one out: + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUPOWER | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING) static struct sched_domain * -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int level = tl->numa_level; - int sd_weight = cpumask_weight( - sched_domains_numa_masks[level][cpu_to_node(cpu)]); + int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, .busy_factor = 32, .imbalance_pct = 125, - .cache_nice_tries = 2, - .busy_idx = 3, - .idle_idx = 2, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, .newidle_idx = 0, .wake_idx = 0, .forkexec_idx = 0, .flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE - | 0*SD_BALANCE_EXEC - | 0*SD_BALANCE_FORK + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE - | 0*SD_WAKE_AFFINE + | 1*SD_WAKE_AFFINE | 0*SD_SHARE_CPUPOWER | 0*SD_SHARE_PKG_RESOURCES - | 1*SD_SERIALIZE + | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING - | 1*SD_NUMA - | sd_local_flags(level) + | 0*SD_NUMA + | sd_flags , + .last_balance = jiffies, .balance_interval = sd_weight, + .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, +#endif }; - SD_INIT_NAME(sd, NUMA); - sd->private = &tl->data; /* - * Ugly hack to pass state to sd_numa_mask()... + * Convert topological properties into behaviour. 
*/ - sched_domains_curr_level = tl->numa_level; + + if (sd->flags & SD_SHARE_CPUPOWER) { + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + sd->flags |= arch_sd_sibling_asym_packing(); + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + sd->private = &tl->data; return sd; } +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif +#ifdef CONFIG_SCHED_BOOK + { cpu_book_mask, SD_INIT_NAME(BOOK) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + static const struct cpumask *sd_numa_mask(int cpu) { return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; @@ -6183,7 +6186,10 @@ static void sched_init_numa(void) } } - tl = kzalloc((ARRAY_SIZE(default_topology) + level) * + /* Compute default topology size */ + for (i = 0; sched_domain_topology[i].mask; i++); + + tl = kzalloc((i + level) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -6191,18 +6197,19 @@ static void sched_init_numa(void) /* * Copy the default topology bits.. */ - for (i = 0; default_topology[i].init; i++) - tl[i] = default_topology[i]; + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; /* * .. and append 'j' levels of NUMA goodness. */ for (j = 0; j < level; i++, j++) { tl[i] = (struct sched_domain_topology_level){ - .init = sd_numa_init, .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, .flags = SDTL_OVERLAP, .numa_level = j, + SD_INIT_NAME(NUMA) }; } @@ -6360,7 +6367,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = tl->init(tl, cpu); + struct sched_domain *sd = sd_init(tl, cpu); if (!sd) return child; -- cgit v1.2.3 From 2dfd747629e65f89d2dbceb92fffc763f66228b2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:38 +0200 Subject: sched, s390: Create a dedicated topology table BOOK level is only relevant for s390 so we create a dedicated topology table with BOOK level and remove it from default table. 
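For illustration, here is roughly what the new interface asks of an architecture. This is a hedged sketch only: my_arch_topology and my_arch_register_topology are invented names and are not part of this patch (the real s390 table is in the diff below). The table is a NULL-terminated array of levels ordered bottom-up, each level's span being contained in the next, and it is installed with set_sched_topology():

	static struct sched_domain_topology_level my_arch_topology[] = {
	#ifdef CONFIG_SCHED_SMT
		{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
	#endif
	#ifdef CONFIG_SCHED_MC
		{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
	#endif
		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
		{ NULL, },
	};

	void __init my_arch_register_topology(void)
	{
		/* Replace the generic default table with the arch-specific one. */
		set_sched_topology(my_arch_topology);
	}
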
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Philipp Hachtmann Cc: cmetcalf@tilera.com Cc: benh@kernel.crashing.org Cc: dietmar.eggemann@arm.com Cc: preeti@linux.vnet.ibm.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: linux390@de.ibm.com Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/1397209481-28542-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/s390/include/asm/topology.h | 11 +---------- arch/s390/kernel/topology.c | 20 ++++++++++++++++++++ kernel/sched/core.c | 3 --- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 07763bdb408d..56af53093d24 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -26,21 +26,12 @@ extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; #define mc_capable() 1 -static inline const struct cpumask *cpu_coregroup_mask(int cpu) -{ - return &cpu_topology[cpu].core_mask; -} - -static inline const struct cpumask *cpu_book_mask(int cpu) -{ - return &cpu_topology[cpu].book_mask; -} - int topology_cpu_init(struct cpu *); int topology_set_cpu_management(int fc); void topology_schedule_update(void); void store_topology(struct sysinfo_15_1_x *info); void topology_expect_change(void); +const struct cpumask *cpu_coregroup_mask(int cpu); #else /* CONFIG_SCHED_BOOK */ diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 6298fed11ced..1860ad3cbc04 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -443,6 +443,23 @@ int topology_cpu_init(struct cpu *cpu) return sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); } +const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return &cpu_topology[cpu].core_mask; +} + +static const struct cpumask *cpu_book_mask(int cpu) +{ + return &cpu_topology[cpu].book_mask; +} + +static struct sched_domain_topology_level s390_topology[] = { + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_book_mask, SD_INIT_NAME(BOOK) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + static int __init topology_init(void) { if (!MACHINE_HAS_TOPOLOGY) { @@ -451,6 +468,9 @@ static int __init topology_init(void) } set_topology_timer(); out: + + set_sched_topology(s390_topology); + return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); } device_initcall(topology_init); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d332b7899cc..e59e5aec745a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6023,9 +6023,6 @@ static struct sched_domain_topology_level default_topology[] = { #endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -#endif -#ifdef CONFIG_SCHED_BOOK - { cpu_book_mask, SD_INIT_NAME(BOOK) }, #endif { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, -- cgit v1.2.3 From 607b45e9a216e89a63351556e488eea06be0ff48 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:39 +0200 Subject: sched, powerpc: Create a dedicated topology table Create a dedicated topology table for handling asymetric feature of powerpc. Signed-off-by: Vincent Guittot Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Cc: Andy Fleming Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Linus Torvalds Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: Preeti U. Murthy Cc: Rob Herring Cc: Srivatsa S. 
Bhat Cc: Toshi Kani Cc: Vasant Hegde Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: schwidefsky@de.ibm.com Cc: cmetcalf@tilera.com Cc: dietmar.eggemann@arm.com Cc: devicetree@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1397209481-28542-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/smp.c | 31 +++++++++++++++++++++++-------- include/linux/sched.h | 2 -- kernel/sched/core.c | 6 ------ 3 files changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index e2a4232c5871..10ffffef0414 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -766,6 +766,28 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } +#ifdef CONFIG_SCHED_SMT +/* cpumask of CPUs with asymetric SMT dependancy */ +static const int powerpc_smt_flags(void) +{ + int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; + + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + flags |= SD_ASYM_PACKING; + } + return flags; +} +#endif + +static struct sched_domain_topology_level powerpc_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + void __init smp_cpus_done(unsigned int max_cpus) { cpumask_var_t old_mask; @@ -790,15 +812,8 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); -} + set_sched_topology(powerpc_topology); -int arch_sd_sibling_asym_packing(void) -{ - if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { - printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); - return SD_ASYM_PACKING; - } - return 0; } #ifdef CONFIG_HOTPLUG_CPU diff --git a/include/linux/sched.h b/include/linux/sched.h index 656b035c30e5..439a153b8403 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -877,8 +877,6 @@ enum cpu_idle_type { #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ #define SD_NUMA 0x4000 /* cross-node balancing */ -extern int __weak arch_sd_sibiling_asym_packing(void); - #ifdef CONFIG_SCHED_SMT static inline const int cpu_smt_flags(void) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e59e5aec745a..7e348e238bf1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5796,11 +5796,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); } -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; -} - /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -5981,7 +5976,6 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) if (sd->flags & SD_SHARE_CPUPOWER) { sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ - sd->flags |= arch_sd_sibling_asym_packing(); } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; -- cgit v1.2.3 From d77b3ed5c9f8ebedf154b52b5e943c461f3d37e6 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:40 +0200 Subject: sched: Add a new SD_SHARE_POWERDOMAIN for sched_domain A new flag SD_SHARE_POWERDOMAIN is created to reflect whether groups of CPUs in a sched_domain level can or not reach different power state. As an example, the flag should be cleared at CPU level if groups of cores can be power gated independently. 
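Concretely, and purely as a hypothetical sketch that is not part of this patch (my_arch_core_flags and the table below are invented for illustration), an architecture whose cores share the package cache and a power domain, while its packages can be power gated independently of each other, could describe that through its topology flags:

	/* Cores in a package share cache and cannot be power gated alone. */
	static const int my_arch_core_flags(void)
	{
		return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
	}

	static struct sched_domain_topology_level my_arch_topology[] = {
		{ cpu_coregroup_mask, my_arch_core_flags, SD_INIT_NAME(MC) },
		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },	/* flag cleared at this level */
		{ NULL, },
	};
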
This information can be used in the load balance decision or to add load balancing level between group of CPUs that can power gate independantly. This flag is part of the topology flags that can be set by arch. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: schwidefsky@de.ibm.com Cc: cmetcalf@tilera.com Cc: benh@kernel.crashing.org Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1397209481-28542-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 439a153b8403..accb66bfd722 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -870,6 +870,7 @@ enum cpu_idle_type { #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7e348e238bf1..1c9c3b7b26af 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5261,7 +5261,8 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES)) { + SD_SHARE_PKG_RESOURCES | + SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) return 0; } @@ -5292,7 +5293,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING); + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5901,6 +5903,7 @@ static int sched_domains_curr_level; * SD_SHARE_CPUPOWER - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -5909,7 +5912,8 @@ static int sched_domains_curr_level; (SD_SHARE_CPUPOWER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ - SD_ASYM_PACKING) + SD_ASYM_PACKING | \ + SD_SHARE_POWERDOMAIN) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, int cpu) -- cgit v1.2.3 From 39a4d9ca77a31503c6317e49742341d0859d5cb2 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 23 Apr 2014 18:30:35 -0700 Subject: sched/fair: Stop searching for tasks in newidle balance if there are runnable tasks It was found that when running some workloads (such as AIM7) on large systems with many cores, CPUs do not remain idle for long. Thus, tasks can wake/get enqueued while doing idle balancing. In this patch, while traversing the domains in idle balance, in addition to checking for pulled_task, we add an extra check for this_rq->nr_running for determining if we should stop searching for tasks to pull. If there are runnable tasks on this rq, then we will stop traversing the domains. This reduces the chance that idle balance delays a task from running. 
This patch resulted in approximately a 6% performance improvement when running a Java Server workload on an 8 socket machine. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Cc: morten.rasmussen@arm.com Cc: aswin@hp.com Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/1398303035-18255-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 051903f33eec..28ccf502c63c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6713,7 +6713,6 @@ static int idle_balance(struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) { t0 = sched_clock_cpu(this_cpu); - /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); @@ -6728,7 +6727,12 @@ static int idle_balance(struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) break; } rcu_read_unlock(); -- cgit v1.2.3 From 1f4ee5038f0c1ef95f8e6d47ad6623e006b5bce1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 May 2014 09:59:34 +0200 Subject: perf: Ensure consistent inherit state in groups Make sure all events in a group have the same inherit state. It was possible for group leaders to have inherit set while sibling events would not have inherit set. In this case we'd still inherit the siblings, leading to some non-fatal weirdness. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-r32tt8yldvic3jlcghd3g35u@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 09866a330af8..1de0d709f69f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7081,20 +7081,26 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (task && group_leader && + group_leader->attr.inherit != attr.inherit) { + err = -EINVAL; + goto err_task; + } + get_online_cpus(); event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_task; + goto err_cpus; } if (flags & PERF_FLAG_PID_CGROUP) { err = perf_cgroup_connect(pid, event, &attr, group_leader); if (err) { __free_event(event); - goto err_task; + goto err_cpus; } } @@ -7256,8 +7262,9 @@ err_context: put_ctx(ctx); err_alloc: free_event(event); -err_task: +err_cpus: put_online_cpus(); +err_task: if (task) put_task_struct(task); err_group_fd: -- cgit v1.2.3 From 15a2d4de0eab533a76bee9e68d7e1063dd25401c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 11:41:02 +0200 Subject: perf: Always destroy groups on exit Commit 38b435b16c36 ("perf: Fix tear-down of inherited group events") states that we need to destroy groups for inherited events, but it doesn't make any sense to not also destroy groups for normal events. 
And while it usually makes no difference (the normal events won't leak, and its very likely all the group events will die in quick succession) it does make the code more consistent and closes a potential hole for trouble. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-426egt8zmsm12d2q8k2xz4tt@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1de0d709f69f..819ffc006d67 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7400,7 +7400,7 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - perf_remove_from_context(child_event, !!child_event->parent); + perf_remove_from_context(child_event, true); /* * It can happen that the parent exits first, and has events -- cgit v1.2.3 From 63342411efd2d9350ad405205da036cd45ed1640 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 11:49:16 +0200 Subject: perf: Validate locking assumption Document and validate the locking assumption of event_sched_in(). Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-sybq1publ9xt5no77cwvi0eo@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 819ffc006d67..0de199729f04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1678,6 +1678,8 @@ event_sched_in(struct perf_event *event, u64 tstamp = perf_event_time(event); int ret = 0; + lockdep_assert_held(&ctx->lock); + if (event->state <= PERF_EVENT_STATE_OFF) return 0; -- cgit v1.2.3 From 683ede43dd412c6cad0d23578a018409ac9c683e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 12:11:24 +0200 Subject: perf: Rework free paths Primarily make perf_event_release_kernel() into put_event(), this will allow kernel space to create per-task inherited events, and is safer in general. Also, document the free_event() assumptions. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-rk9pvr6e1d0559lxstltbztc@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 66 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0de199729f04..83505a35afba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3251,7 +3251,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } -static void free_event(struct perf_event *event) + +static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3279,42 +3280,31 @@ static void free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); } -int perf_event_release_kernel(struct perf_event *event) +/* + * Used to free events which have a known refcount of 1, such as in error paths + * where the event isn't exposed yet and inherited events. 
+ */ +static void free_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - - WARN_ON_ONCE(ctx->parent_ctx); - /* - * There are two ways this annotation is useful: - * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. - * - * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. - */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - perf_remove_from_context(event, true); - mutex_unlock(&ctx->mutex); - - free_event(event); + if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { + /* leak to avoid use-after-free */ + return; + } - return 0; + _free_event(event); } -EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ static void put_event(struct perf_event *event) { + struct perf_event_context *ctx = event->ctx; struct task_struct *owner; if (!atomic_long_dec_and_test(&event->refcount)) @@ -3353,9 +3343,33 @@ static void put_event(struct perf_event *event) put_task_struct(owner); } - perf_event_release_kernel(event); + WARN_ON_ONCE(ctx->parent_ctx); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + perf_remove_from_context(event, true); + mutex_unlock(&ctx->mutex); + + _free_event(event); } +int perf_event_release_kernel(struct perf_event *event) +{ + put_event(event); + return 0; +} +EXPORT_SYMBOL_GPL(perf_event_release_kernel); + static int perf_release(struct inode *inode, struct file *file) { put_event(file->private_data); -- cgit v1.2.3 From 3a497f48637e2aac17eabb84a17f8ac5216028fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 12:42:29 +0200 Subject: perf: Simplify perf_event_exit_task_context() Instead of jumping through hoops to make sure to find (and exit) each event, do it the simple straight fwd way. 
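For clarity, the per-context exit path after this change is a single pass over the context's event list under the context mutex; this is an excerpt of the resulting code (the diff below mostly deletes the old pinned/flexible two-list retry loop):

	mutex_lock(&child_ctx->mutex);

	list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	mutex_unlock(&child_ctx->mutex);
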
Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-tij931199thfkys8vbnokdpf@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 83505a35afba..7ab734fbaeeb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7485,24 +7485,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ mutex_lock(&child_ctx->mutex); -again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, - group_entry) + list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry) __perf_event_exit_task(child_event, child_ctx, child); - /* - * If the last event was a group event, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->pinned_groups) || - !list_empty(&child_ctx->flexible_groups)) - goto again; - mutex_unlock(&child_ctx->mutex); put_ctx(child_ctx); -- cgit v1.2.3 From fd99f91aa007ba255aac44fe6cf21c1db398243a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Apr 2014 15:35:08 +0200 Subject: sched/idle: Avoid spurious wakeup IPIs Because mwait_idle_with_hints() gets called from !idle context it must call current_clr_polling(). This however means that resched_task() is very likely to send an IPI even when we were polling: CPU0 CPU1 if (current_set_polling_and_test()) goto out; __monitor(&ti->flags); if (!need_resched()) __mwait(eax, ecx); set_tsk_need_resched(p); smp_mb(); out: current_clr_polling(); if (!tsk_is_polling(p)) smp_send_reschedule(cpu); So while it is correct (extra IPIs aren't a problem, whereas a missed IPI would be) it is a performance problem (for some). Avoid this issue by using fetch_or() to atomically set NEED_RESCHED and test if POLLING_NRFLAG is set. Since a CPU stuck in mwait is unlikely to modify the flags word, contention on the cmpxchg is unlikely and thus we should mostly succeed in a single go. Signed-off-by: Peter Zijlstra Acked-by: Nicolas Pitre Cc: Andy Lutomirski Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-kf5suce6njh5xf5d3od13rr0@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1c9c3b7b26af..4b82622b6252 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,6 +505,39 @@ static inline void init_hrtick(void) } #endif /* CONFIG_SCHED_HRTICK */ +/* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val) \ +({ typeof(*(ptr)) __old, __val = *(ptr); \ + for (;;) { \ + __old = cmpxchg((ptr), __val, __val | (val)); \ + if (__old == __val) \ + break; \ + __val = __old; \ + } \ + __old; \ +}) + +#ifdef TIF_POLLING_NRFLAG +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. 
+ */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} +#endif + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -521,17 +554,15 @@ void resched_task(struct task_struct *p) if (test_tsk_need_resched(p)) return; - set_tsk_need_resched(p); - cpu = task_cpu(p); + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); set_preempt_need_resched(); return; } - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) + if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); } -- cgit v1.2.3 From c444117f0f39d59733ec23da67c44424df529230 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Apr 2014 13:47:16 +0200 Subject: sched/idle: Delay clearing the polling bit With the generic idle functions assuming !polling we should only clear the polling bit at the very last opportunity in order to avoid spurious IPIs. Ideally we'd flip the default to polling, but that means auditing all arch idle functions. Signed-off-by: Peter Zijlstra Acked-by: Nicolas Pitre Cc: Daniel Lezcano Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-vq7719foqzf6z5h4j7eh7f9e@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a079c7..ed67f0cd2906 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -78,12 +78,10 @@ static int cpuidle_idle_call(void) /* * Check if the idle task must be rescheduled. If it is the - * case, exit the function after re-enabling the local irq and - * set again the polling flag + * case, exit the function after re-enabling the local irq. */ - if (current_clr_polling_and_test()) { + if (need_resched()) { local_irq_enable(); - __current_set_polling(); return 0; } @@ -127,7 +125,7 @@ static int cpuidle_idle_call(void) broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); - if (broadcast) + if (broadcast) { /* * Tell the time framework to switch * to a broadcast timer because our @@ -139,6 +137,7 @@ static int cpuidle_idle_call(void) ret = clockevents_notify( CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu); + } if (!ret) { trace_cpu_idle_rcuidle(next_state, dev->cpu); @@ -175,8 +174,12 @@ static int cpuidle_idle_call(void) * We can't use the cpuidle framework, let's use the default * idle routine */ - if (ret) - arch_cpu_idle(); + if (ret) { + if (!current_clr_polling_and_test()) + arch_cpu_idle(); + else + local_irq_enable(); + } __current_set_polling(); -- cgit v1.2.3 From 37352273ad48f2d177ed1b06ced32d5536b773fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Apr 2014 13:55:48 +0200 Subject: sched/idle: Reflow cpuidle_idle_call() Apply goto to reduce lines and nesting levels. 
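Abbreviated, the resulting control flow looks as follows; this sketch omits the tracepoints and the trailing irq/RCU bookkeeping, see the full diff below:

	if (cpuidle_enabled(drv, dev)) {
	use_default:
		/* cpuidle is not usable, fall back to the default idle routine */
		if (current_clr_polling_and_test())
			local_irq_enable();
		else
			arch_cpu_idle();
		goto exit_idle;
	}

	next_state = cpuidle_select(drv, dev);

	if (current_clr_polling_and_test()) {
		dev->last_residency = 0;
		entered_state = next_state;
		local_irq_enable();
		goto exit_idle;
	}

	broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
	if (broadcast &&
	    clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
		goto use_default;

	entered_state = cpuidle_enter(drv, dev, next_state);

	if (broadcast)
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);

	cpuidle_reflect(dev, entered_state);

	exit_idle:
	__current_set_polling();
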
Signed-off-by: Peter Zijlstra Acked-by: Nicolas Pitre Cc: Daniel Lezcano Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-cc6vb0snt3sr7op6rlbfeqfh@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 131 +++++++++++++++++++++++----------------------------- 1 file changed, 58 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ed67f0cd2906..88a6bc43738b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -73,7 +73,7 @@ static int cpuidle_idle_call(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); - int next_state, entered_state, ret; + int next_state, entered_state; bool broadcast; /* @@ -102,90 +102,75 @@ static int cpuidle_idle_call(void) * Check if the cpuidle framework is ready, otherwise fallback * to the default arch specific idle method */ - ret = cpuidle_enabled(drv, dev); - - if (!ret) { - /* - * Ask the governor to choose an idle state it thinks - * it is convenient to go to. There is *always* a - * convenient idle state - */ - next_state = cpuidle_select(drv, dev); - + if (cpuidle_enabled(drv, dev)) { +use_default: /* - * The idle task must be scheduled, it is pointless to - * go to idle, just update no idle residency and get - * out of this function + * We can't use the cpuidle framework, let's use the default + * idle routine. */ - if (current_clr_polling_and_test()) { - dev->last_residency = 0; - entered_state = next_state; + if (current_clr_polling_and_test()) local_irq_enable(); - } else { - broadcast = !!(drv->states[next_state].flags & - CPUIDLE_FLAG_TIMER_STOP); - - if (broadcast) { - /* - * Tell the time framework to switch - * to a broadcast timer because our - * local timer will be shutdown. If a - * local timer is used from another - * cpu as a broadcast timer, this call - * may fail if it is not available - */ - ret = clockevents_notify( - CLOCK_EVT_NOTIFY_BROADCAST_ENTER, - &dev->cpu); - } - - if (!ret) { - trace_cpu_idle_rcuidle(next_state, dev->cpu); - - /* - * Enter the idle state previously - * returned by the governor - * decision. This function will block - * until an interrupt occurs and will - * take care of re-enabling the local - * interrupts - */ - entered_state = cpuidle_enter(drv, dev, - next_state); - - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, - dev->cpu); - - if (broadcast) - clockevents_notify( - CLOCK_EVT_NOTIFY_BROADCAST_EXIT, - &dev->cpu); - - /* - * Give the governor an opportunity to reflect on the - * outcome - */ - cpuidle_reflect(dev, entered_state); - } - } + else + arch_cpu_idle(); + + goto exit_idle; } /* - * We can't use the cpuidle framework, let's use the default - * idle routine + * Ask the governor to choose an idle state it thinks + * it is convenient to go to. There is *always* a + * convenient idle state */ - if (ret) { - if (!current_clr_polling_and_test()) - arch_cpu_idle(); - else - local_irq_enable(); + next_state = cpuidle_select(drv, dev); + + /* + * The idle task must be scheduled, it is pointless to + * go to idle, just update no idle residency and get + * out of this function + */ + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + entered_state = next_state; + local_irq_enable(); + goto exit_idle; } + broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + + /* + * Tell the time framework to switch to a broadcast timer + * because our local timer will be shutdown. 
If a local timer + * is used from another cpu as a broadcast timer, this call may + * fail if it is not available + */ + if (broadcast && + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + goto use_default; + + trace_cpu_idle_rcuidle(next_state, dev->cpu); + + /* + * Enter the idle state previously returned by the governor decision. + * This function will block until an interrupt occurs and will take + * care of re-enabling the local interrupts + */ + entered_state = cpuidle_enter(drv, dev, next_state); + + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + + if (broadcast) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + + /* + * Give the governor an opportunity to reflect on the outcome + */ + cpuidle_reflect(dev, entered_state); + +exit_idle: __current_set_polling(); /* - * It is up to the idle functions to enable back the local - * interrupt + * It is up to the idle functions to reenable local interrupts */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); -- cgit v1.2.3 From 08c373e5123b4595588ae1a7aa7e00a046c61cc6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Apr 2014 01:26:58 +0200 Subject: sched/idle: Make cpuidle_idle_call() void The only value ever returned by cpuidle_idle_call() is 0 and its only caller ignores that value anyway, so make it void. Signed-off-by: Rafael J. Wysocki Cc: Daniel Lezcano Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4717784.WmVEpDoliM@vostro.rjw.lan Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 88a6bc43738b..34083c9ac976 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -67,9 +67,8 @@ void __weak arch_cpu_idle(void) * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here - * return non-zero on failure */ -static int cpuidle_idle_call(void) +static void cpuidle_idle_call(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); @@ -82,7 +81,7 @@ static int cpuidle_idle_call(void) */ if (need_resched()) { local_irq_enable(); - return 0; + return; } /* @@ -177,8 +176,6 @@ exit_idle: rcu_idle_exit(); start_critical_timings(); - - return 0; } /* -- cgit v1.2.3 From 317cf7e5e85e3ef9f23fc6dd8b2945ab4a258140 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 9 May 2014 23:32:08 +0200 Subject: PM / hibernate: convert simple_strtoul to kstrtoul Replace obsolete function. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f08ac7f55d8..2377ff72994c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1115,7 +1115,10 @@ static int __init resumewait_setup(char *str) static int __init resumedelay_setup(char *str) { - resume_delay = simple_strtoul(str, NULL, 0); + int rc = kstrtoul(str, 0, (unsigned long *)&resume_delay); + + if (rc) + return rc; return 1; } -- cgit v1.2.3 From cdafb93feb3d0ae3131f631a10f70954c96cbef8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 9 May 2014 20:32:25 +0200 Subject: ntp: Convert simple_strtol to kstrtol Replace obsolete function simple_strtol w/ kstrtol Inspired-By: Andrew Morton Cc: John Stultz Cc: Andrew Morton Signed-off-by: Fabian Frederick [jstultz: Tweak commit message] Signed-off-by: John Stultz --- kernel/time/ntp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 419a52cecd20..82b7c9edba7e 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -923,7 +923,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) static int __init ntp_tick_adj_setup(char *str) { - ntp_tick_adj = simple_strtol(str, NULL, 0); + int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); + + if (rc) + return rc; ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; -- cgit v1.2.3 From ea54bca3aab3468daf1c37d15047ee66bca8760d Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Mon, 12 May 2014 09:35:48 -0400 Subject: ntp: Make is_error_status() use its argument is_error_status() is an inline function always called with the global time_status as an argument, so there's zero functional difference with this change, but the non-CONFIG_NTP_PPS version uses the passed-in argument, while the CONFIG_NTP_PPS one ignores its argument and uses the global. Looks like is_error_status was refactored out, but someone forgot to change the logic to check the local argument value. Thus this patch makes it use the argument always; shorter variable names are good. 
Signed-off-by: George Spelvin [jstultz: Tweaked commit message] Signed-off-by: John Stultz --- kernel/time/ntp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 82b7c9edba7e..c8780cdaf852 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq) static inline int is_error_status(int status) { - return (time_status & (STA_UNSYNC|STA_CLOCKERR)) + return (status & (STA_UNSYNC|STA_CLOCKERR)) /* PPS signal lost when either PPS time or * PPS frequency synchronization requested */ - || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) - && !(time_status & STA_PPSSIGNAL)) + || ((status & (STA_PPSFREQ|STA_PPSTIME)) + && !(status & STA_PPSSIGNAL)) /* PPS jitter exceeded when * PPS time synchronization requested */ - || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) + || ((status & (STA_PPSTIME|STA_PPSJITTER)) == (STA_PPSTIME|STA_PPSJITTER)) /* PPS wander exceeded or calibration error when * PPS frequency synchronization requested */ - || ((time_status & STA_PPSFREQ) - && (time_status & (STA_PPSWANDER|STA_PPSERROR))); + || ((status & STA_PPSFREQ) + && (status & (STA_PPSWANDER|STA_PPSERROR))); } static inline void pps_fill_timex(struct timex *txc) -- cgit v1.2.3 From ad0dc7f94dbf417b1c7d42e1f0b250f045b27f8f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 Feb 2014 10:51:42 -0800 Subject: rcutorture: Add forward-progress checking for writer The rcutorture output currently does not distinguish between stalls in the RCU implementation and stalls in the rcu_torture_writer() kthreads. This commit therefore adds some diagnostics to help distinguish between these two conditions, at least for the non-SRCU implementations. (SRCU does not provide evidence of update-side forward progress by design.) Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 19 +++++++++++++++++++ kernel/rcu/rcutorture.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/rcu/tree.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 00a7fd61b3c6..82973738125b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -51,7 +51,17 @@ extern int rcu_expedited; /* for sysctl */ extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +enum rcutorture_type { + RCU_FLAVOR, + RCU_BH_FLAVOR, + RCU_SCHED_FLAVOR, + SRCU_FLAVOR, + INVALID_RCU_FLAVOR +}; + #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, + unsigned long *gpnum, unsigned long *completed); void rcutorture_record_test_transition(void); void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, @@ -60,6 +70,15 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c_old, unsigned long c); #else +static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, + int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + *flags = 0; + *gpnum = 0; + *completed = 0; +} static inline void rcutorture_record_test_transition(void) { } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bd30bc61bc05..0d739e3797e3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -138,6 +138,15 @@ static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; +static int rcu_torture_writer_state; +#define RTWS_FIXED_DELAY 0 +#define RTWS_DELAY 1 +#define RTWS_REPLACE 2 +#define RTWS_DEF_FREE 3 +#define RTWS_EXP_SYNC 4 +#define RTWS_STUTTER 5 +#define RTWS_STOPPING 6 + #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 #else @@ -214,6 +223,7 @@ rcu_torture_free(struct rcu_torture *p) */ struct rcu_torture_ops { + int ttype; void (*init)(void); int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); @@ -312,6 +322,7 @@ static void rcu_sync_torture_init(void) } static struct rcu_torture_ops rcu_ops = { + .ttype = RCU_FLAVOR, .init = rcu_sync_torture_init, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, @@ -355,6 +366,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops rcu_bh_ops = { + .ttype = RCU_BH_FLAVOR, .init = rcu_sync_torture_init, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ @@ -397,6 +409,7 @@ call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } static struct rcu_torture_ops rcu_busted_ops = { + .ttype = INVALID_RCU_FLAVOR, .init = rcu_sync_torture_init, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ @@ -492,6 +505,7 @@ static void srcu_torture_synchronize_expedited(void) } static struct rcu_torture_ops srcu_ops = { + .ttype = SRCU_FLAVOR, .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, @@ -527,6 +541,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops sched_ops = { + .ttype = RCU_SCHED_FLAVOR, .init = rcu_sync_torture_init, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ @@ -699,12 +714,15 @@ rcu_torture_writer(void *arg) set_user_nice(current, MAX_NICE); do { + rcu_torture_writer_state = RTWS_FIXED_DELAY; schedule_timeout_uninterruptible(1); rp = rcu_torture_alloc(); if (rp == NULL) continue; rp->rtort_pipe_count = 0; + rcu_torture_writer_state = RTWS_DELAY; udelay(torture_random(&rand) & 0x3ff); + rcu_torture_writer_state = RTWS_REPLACE; old_rp = rcu_dereference_check(rcu_torture_current, current == writer_task); rp->rtort_mbtest = 1; @@ -721,8 +739,10 @@ rcu_torture_writer(void *arg) else exp = gp_exp; if (!exp) { + rcu_torture_writer_state = RTWS_DEF_FREE; cur_ops->deferred_free(old_rp); } else { + rcu_torture_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); list_add(&old_rp->rtort_free, &rcu_torture_removed); @@ -743,8 +763,10 @@ rcu_torture_writer(void *arg) } } rcutorture_record_progress(++rcu_torture_current_version); + rcu_torture_writer_state = RTWS_STUTTER; stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); + rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; } @@ -937,6 +959,7 @@ rcu_torture_printk(char *page) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + static unsigned long rtcv_snap = ULONG_MAX; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { @@ -997,6 +1020,20 @@ rcu_torture_printk(char *page) page += sprintf(page, "\n"); if (cur_ops->stats) cur_ops->stats(page); + if (rtcv_snap == rcu_torture_current_version && + rcu_torture_current != NULL) { + int __maybe_unused flags; + unsigned long __maybe_unused gpnum; + unsigned long __maybe_unused completed; + + rcutorture_get_gp_data(cur_ops->ttype, + &flags, &gpnum, &completed); + page += sprintf(page, + "??? Writer stall state %d g%lu c%lu f%#x\n", + rcu_torture_writer_state, + gpnum, completed, flags); + } + rtcv_snap = rcu_torture_current_version; } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0c47e300210a..3d15b5a82ae8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -293,6 +293,39 @@ void rcutorture_record_test_transition(void) } EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); +/* + * Send along grace-period-related data for rcutorture diagnostics. + */ +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, + unsigned long *gpnum, unsigned long *completed) +{ + struct rcu_state *rsp = NULL; + + switch (test_type) { + case RCU_FLAVOR: + rsp = rcu_state; + break; + case RCU_BH_FLAVOR: + rsp = &rcu_bh_state; + break; + case RCU_SCHED_FLAVOR: + rsp = &rcu_sched_state; + break; + default: + break; + } + if (rsp != NULL) { + *flags = ACCESS_ONCE(rsp->gp_flags); + *gpnum = ACCESS_ONCE(rsp->gpnum); + *completed = ACCESS_ONCE(rsp->completed); + return; + } + *flags = 0; + *gpnum = 0; + *completed = 0; +} +EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); + /* * Record the number of writer passes through the current rcutorture test. 
* This is also used to correlate debugfs tracing stats with the rcutorture -- cgit v1.2.3 From da601c63fd3a3e6c30f85eefd5ee46397b5b439d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Feb 2014 12:14:51 -0800 Subject: torture: Intensify locking test The current lock_torture_writer() spends too much time sleeping and not enough time hammering locks, as in an eight-CPU test will often only be utilizing a CPU or two. This commit therefore makes lock_torture_writer() sleep less and hammer more. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f26b1a18e34e..b0d3e3c50672 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -219,7 +219,8 @@ static int lock_torture_writer(void *arg) set_user_nice(current, 19); do { - schedule_timeout_uninterruptible(1); + if ((torture_random(&rand) & 0xfffff) == 0) + schedule_timeout_uninterruptible(1); cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_write_lock_fail++; -- cgit v1.2.3 From b3b8a4d42bba8e1fb1b91cc6fd53829d28a503de Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:16:57 +0530 Subject: rcutorture: Mark function as static in kernel/rcu/torture.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark functions as static in kernel/rcu/torture.c because they are not used outside this file. This eliminates the following warning in kernel/rcu/torture.c: kernel/rcu/torture.c:902:6: warning: no previous prototype for ‘rcutorture_trace_dump’ [-Wmissing-prototypes] kernel/rcu/torture.c:1572:6: warning: no previous prototype for ‘rcu_torture_barrier_cbf’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0d739e3797e3..0bc5f0a4c1ab 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -806,7 +806,7 @@ rcu_torture_fakewriter(void *arg) return 0; } -void rcutorture_trace_dump(void) +static void rcutorture_trace_dump(void) { static atomic_t beenhere = ATOMIC_INIT(0); @@ -1183,7 +1183,7 @@ static int __init rcu_torture_stall_init(void) } /* Callback function for RCU barrier testing. */ -void rcu_torture_barrier_cbf(struct rcu_head *rcu) +static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { atomic_inc(&barrier_cbs_invoked); } -- cgit v1.2.3 From 589a8f59509dc4c68f9c1e2522c5b0b556009221 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Mar 2014 16:51:08 -0800 Subject: rcutorture: Print negatives for SRCU counter wraparound The srcu_torture_stats() function prints SRCU's per-CPU c[] array with an unsigned format, which means that the number one less than zero is a very large number. This commit therefore prints this array with a signed format in order to improve readability of the rcutorture output. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0bc5f0a4c1ab..80d2d2440210 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -492,9 +492,11 @@ static void srcu_torture_stats(char *page) page += sprintf(page, "%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { - page += sprintf(page, " %d(%lu,%lu)", cpu, - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + long c0, c1; + + c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; + c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; + page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); } sprintf(page, "\n"); } -- cgit v1.2.3 From 0d6821d5f70b7137974575758962bae61ed0fc63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Mar 2014 16:58:03 -0800 Subject: torture: Include "Stopping" string to torture_kthread_stopping() Currently, torture_kthread_stopping() prints only the name of the kthread that is stopping, which can be unedifying. This commit therefore adds "Stopping" to make things more evident. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index acc9afc2f26e..e5af6be2594d 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -674,8 +674,10 @@ EXPORT_SYMBOL_GPL(torture_must_stop_irq); */ void torture_kthread_stopping(char *title) { - if (verbose) - VERBOSE_TOROUT_STRING(title); + char buf[128]; + + snprintf(buf, sizeof(buf), "Stopping %s", title); + VERBOSE_TOROUT_STRING(buf); while (!kthread_should_stop()) { torture_shutdown_absorb(title); schedule_timeout_uninterruptible(1); -- cgit v1.2.3 From ab7d45053f99f44f81a221eb5c9fbe253ee94524 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Mar 2014 11:03:21 -0800 Subject: torture: Increase stutter-end intensity Currently, all stuttered kthreads block a jiffy at a time, which can result in them starting at different times. (Note: This is not an energy-efficiency problem unless you run torture tests in production, in which case you have other problems!) This commit increases the intensity of the restart event by causing kthreads to spin through the last jiffy, restarting when they see the variable change. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index e5af6be2594d..bc0ee382b3c8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -533,7 +533,11 @@ void stutter_wait(const char *title) while (ACCESS_ONCE(stutter_pause_test) || (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { if (stutter_pause_test) - schedule_timeout_interruptible(1); + if (ACCESS_ONCE(stutter_pause_test) == 1) + schedule_timeout_interruptible(1); + else + while (ACCESS_ONCE(stutter_pause_test)) + cond_resched(); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); torture_shutdown_absorb(title); @@ -550,7 +554,11 @@ static int torture_stutter(void *arg) VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop()) { - schedule_timeout_interruptible(stutter); + if (stutter > 1) { + schedule_timeout_interruptible(stutter - 1); + ACCESS_ONCE(stutter_pause_test) = 2; + } + schedule_timeout_interruptible(1); ACCESS_ONCE(stutter_pause_test) = 1; } if (!torture_must_stop()) -- cgit v1.2.3 From 945fa9c631b04febe295a3a2a00c7e4a3cfb97db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Mar 2014 14:15:28 -0800 Subject: torture: Dump ftrace buffer when the RCU grace period stalls This commit adds a call to rcutorture_trace_dump() to dump the ftrace buffer when the RCU grace period stalls in order to help debug the stall. Note that this is different than the RCU CPU stall warning, as it is rcutorture detecting the stall rather than the underlying RCU implementation. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 80d2d2440210..9decce0f110c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1034,6 +1034,7 @@ rcu_torture_printk(char *page) "??? Writer stall state %d g%lu c%lu f%#x\n", rcu_torture_writer_state, gpnum, completed, flags); + rcutorture_trace_dump(); } rtcv_snap = rcu_torture_current_version; } -- cgit v1.2.3 From afea227fd4acf4f097a9e77bbc2f07d4856ebd01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Mar 2014 07:10:41 -0700 Subject: rcutorture: Export RCU grace-period kthread wait state to rcutorture This commit allows rcutorture to print additional state for the RCU grace-period kthreads in cases where RCU seems reluctant to start a new grace period. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcutiny.h | 4 ++++ include/linux/rcutree.h | 1 + kernel/rcu/rcutorture.c | 1 + kernel/rcu/tree.c | 17 +++++++++++++++++ kernel/rcu/tree.h | 8 +++++++- 5 files changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 425c659d54e5..d40a6a451330 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -119,6 +119,10 @@ static inline void rcu_sched_force_quiescent_state(void) { } +static inline void show_rcu_gp_kthreads(void) +{ +} + static inline void rcu_cpu_stall_reset(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index a59ca05fd4e3..3e2f5d432743 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -84,6 +84,7 @@ extern unsigned long rcutorture_vernum; long rcu_batches_completed(void); long rcu_batches_completed_bh(void); long rcu_batches_completed_sched(void); +void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9decce0f110c..37ae5e1d4a1d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1034,6 +1034,7 @@ rcu_torture_printk(char *page) "??? Writer stall state %d g%lu c%lu f%#x\n", rcu_torture_writer_state, gpnum, completed, flags); + show_rcu_gp_kthreads(); rcutorture_trace_dump(); } rtcv_snap = rcu_torture_current_version; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3d15b5a82ae8..93e64381aa2a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -279,6 +279,21 @@ void rcu_bh_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); +/* + * Show the state of the grace-period kthreads. + */ +void show_rcu_gp_kthreads(void) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + pr_info("%s: wait state: %d ->state: %#lx\n", + rsp->name, rsp->gp_state, rsp->gp_kthread->state); + /* sched_show_task(rsp->gp_kthread); */ + } +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + /* * Record the number of times rcutorture tests have been initiated and * terminated. This information allows the debugfs tracing stats to be @@ -1626,6 +1641,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("reqwait")); + rsp->gp_state = RCU_GP_WAIT_GPS; wait_event_interruptible(rsp->gp_wq, ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); @@ -1653,6 +1669,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("fqswait")); + rsp->gp_state = RCU_GP_WAIT_FQS; ret = wait_event_interruptible_timeout(rsp->gp_wq, ((gf = ACCESS_ONCE(rsp->gp_flags)) & RCU_GP_FLAG_FQS) || diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 75dc3c39a02a..c2fd1e722879 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -406,7 +406,8 @@ struct rcu_state { unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ wait_queue_head_t gp_wq; /* Where GP task waits. */ - int gp_flags; /* Commands for GP task. */ + short gp_flags; /* Commands for GP task. */ + short gp_state; /* GP kthread sleep state. */ /* End of fields guarded by root rcu_node's lock. */ @@ -469,6 +470,11 @@ struct rcu_state { #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +/* Values for rcu_state structure's gp_flags field. 
*/ +#define RCU_GP_WAIT_INIT 0 /* Initial state. */ +#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ +#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ -- cgit v1.2.3 From ac1bea85781e9004da9b3e8a4b097c18492d857c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 16 Mar 2014 21:36:25 -0700 Subject: sched,rcu: Make cond_resched() report RCU quiescent states Given a CPU running a loop containing cond_resched(), with no other tasks runnable on that CPU, RCU will eventually report RCU CPU stall warnings due to lack of quiescent states. Fortunately, every call to cond_resched() is a perfectly good quiescent state. Unfortunately, invoking rcu_note_context_switch() is a bit heavyweight for cond_resched(), especially given the need to disable preemption, and, for RCU-preempt, interrupts as well. This commit therefore maintains a per-CPU counter that causes cond_resched(), cond_resched_lock(), and cond_resched_softirq() to call rcu_note_context_switch(), but only about once per 256 invocations. This ratio was chosen in keeping with the relative time constants of RCU grace periods. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/rcu/update.c | 18 ++++++++++++++++++ kernel/sched/core.c | 7 ++++++- 3 files changed, 60 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 82973738125b..97cc8d6679b4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,6 +44,7 @@ #include #include #include +#include #include extern int rcu_expedited; /* for sysctl */ @@ -286,6 +287,41 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ +/* + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. + */ + +#define RCU_COND_RESCHED_LIM 256 /* ms vs. 100s of ms. */ +DECLARE_PER_CPU(int, rcu_cond_resched_count); +void rcu_resched(void); + +/* + * Is it time to report RCU quiescent states? + * + * Note unsynchronized access to rcu_cond_resched_count. Yes, we might + * increment some random CPU's count, and possibly also load the result from + * yet another CPU's count. We might even clobber some other CPU's attempt + * to zero its counter. This is all OK because the goal is not precision, + * but rather reasonable amortization of rcu_note_context_switch() overhead + * and extremely high probability of avoiding RCU CPU stall warnings. + * Note that this function has to be preempted in just the wrong place, + * many thousands of times in a row, for anything bad to happen. + */ +static inline bool rcu_should_resched(void) +{ + return raw_cpu_inc_return(rcu_cond_resched_count) >= + RCU_COND_RESCHED_LIM; +} + +/* + * Report quiscent states to RCU if it is time to do so. + */ +static inline void rcu_cond_resched(void) +{ + if (unlikely(rcu_should_resched())) + rcu_resched(); +} + /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. 
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c0a9b0af469..ed7a0d72562c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -338,3 +338,21 @@ static int __init check_cpu_stall_init(void) early_initcall(check_cpu_stall_init); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + +/* + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. + */ + +DEFINE_PER_CPU(int, rcu_cond_resched_count); + +/* + * Report a set of RCU quiescent states, for use by cond_resched() + * and friends. Out of line due to being called infrequently. + */ +void rcu_resched(void) +{ + preempt_disable(); + __this_cpu_write(rcu_cond_resched_count, 0); + rcu_note_context_switch(smp_processor_id()); + preempt_enable(); +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..9f530c9ed911 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4051,6 +4051,7 @@ static void __cond_resched(void) int __sched _cond_resched(void) { + rcu_cond_resched(); if (should_resched()) { __cond_resched(); return 1; @@ -4069,15 +4070,18 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { + bool need_rcu_resched = rcu_should_resched(); int resched = should_resched(); int ret = 0; lockdep_assert_held(lock); - if (spin_needbreak(lock) || resched) { + if (spin_needbreak(lock) || resched || need_rcu_resched) { spin_unlock(lock); if (resched) __cond_resched(); + else if (unlikely(need_rcu_resched)) + rcu_resched(); else cpu_relax(); ret = 1; @@ -4091,6 +4095,7 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); + rcu_cond_resched(); /* BH disabled OK, just recording QSes. */ if (should_resched()) { local_bh_enable(); __cond_resched(); -- cgit v1.2.3 From 64e4b43ae050146fcfafe696e61efc306f73d449 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Mar 2014 10:26:35 -0700 Subject: rcutorture: Make rcu_torture_reader() use cond_resched() The rcu_torture_reader() function currently uses schedule(). This commit therefore speeds things up a bit by substituting cond_resched(). This change makes rcu_torture_reader() more CPU-bound, so this commit also adjusts the number of readers (the "nreaders" module parameter, which feeds into the "nrealreaders" variable) to allow one CPU to be free of readers on SMP systems. The point of this is to increase the probability that readers will be watching while an updater makes a change. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 37ae5e1d4a1d..693a90fcee83 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -942,7 +942,7 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - schedule(); + cond_resched(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) @@ -1482,10 +1482,13 @@ rcu_torture_init(void) if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ - if (nreaders >= 0) + if (nreaders >= 0) { nrealreaders = nreaders; - else - nrealreaders = 2 * num_online_cpus(); + } else { + nrealreaders = num_online_cpus() - 1; + if (nrealreaders <= 0) + nrealreaders = 1; + } rcu_torture_print_module_parms(cur_ops, "Start of test"); /* Set up the freelist. 
*/ -- cgit v1.2.3 From 5ed63b199c5b58ed150ce50f1ea68de54669b13f Mon Sep 17 00:00:00 2001 From: Iulia Manda Date: Mon, 17 Mar 2014 15:21:21 +0200 Subject: torture: Notice if an all-zero cpumask is passed inside a critical section In torture_shuffle_tasks function, the check if an all-zero mask can be passed to set_cpus_allowed_ptr() is redundant after clearing the shuffle_idle_cpu bit. If the mask had more than one bit set, after clearing a bit it has at least one bit set. If the mask had only one bit set, a check is made at the beginning, where the function returns, as there is no need to shuffle only one cpu. Also, this code is executed inside a critical section, delimited by get_online_cpus(), and put_online_cpus(), preventing CPUs from leaving between the check of num_online_cpus and the calls to set_cpus_allowed_ptr() function. Signed-off-by: Iulia Manda Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index bc0ee382b3c8..ae1723a4c751 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -335,13 +335,8 @@ static void torture_shuffle_tasks(void) shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); if (shuffle_idle_cpu >= nr_cpu_ids) shuffle_idle_cpu = -1; - if (shuffle_idle_cpu != -1) { + else cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); - if (cpumask_empty(shuffle_tmp_mask)) { - put_online_cpus(); - return; - } - } mutex_lock(&shuffle_task_mutex); list_for_each_entry(stp, &shuffle_task_list, st_l) -- cgit v1.2.3 From d0d0606e2c13ad445a58b9d9547de617429cabf9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Mar 2014 20:56:45 -0700 Subject: rcutorture: Check for rcu_torture_fqs creation errors The return value from torture_create_kthread() is currently ignored when creating the rcu_torture_fqs kthread. This commit therefore captures the return value so that it can be tested for errors. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 693a90fcee83..dfec2582899f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1577,7 +1577,8 @@ rcu_torture_init(void) fqs_duration = 0; if (fqs_duration) { /* Create the fqs thread */ - torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); + firsterr = torture_create_kthread(rcu_torture_fqs, NULL, + fqs_task); if (firsterr) goto unwind; } -- cgit v1.2.3 From a48f3fad4f97fe6a2522fe2f5b3054b4c48a8eac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Mar 2014 15:57:41 -0700 Subject: rcutorture: Add tests for get_state_synchronize_rcu() This commit adds rcutorture testing for get_state_synchronize_rcu() and cond_synchronize_rcu(). Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 130 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 95 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dfec2582899f..0d27b9cc14e4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -58,6 +58,7 @@ torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); +torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); @@ -144,8 +145,10 @@ static int rcu_torture_writer_state; #define RTWS_REPLACE 2 #define RTWS_DEF_FREE 3 #define RTWS_EXP_SYNC 4 -#define RTWS_STUTTER 5 -#define RTWS_STOPPING 6 +#define RTWS_COND_GET 5 +#define RTWS_COND_SYNC 6 +#define RTWS_STUTTER 7 +#define RTWS_STOPPING 8 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -232,6 +235,8 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + unsigned long (*get_state)(void); + void (*cond_sync)(unsigned long oldstate); void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); @@ -283,10 +288,48 @@ static int rcu_torture_completed(void) return rcu_batches_completed(); } +/* + * Update callback in the pipe. This should be invoked after a grace period. + */ +static bool +rcu_torture_pipe_update_one(struct rcu_torture *rp) +{ + int i; + + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + return true; + } + return false; +} + +/* + * Update all callbacks in the pipe. Suitable for synchronous grace-period + * primitives. + */ +static void +rcu_torture_pipe_update(struct rcu_torture *old_rp) +{ + struct rcu_torture *rp; + struct rcu_torture *rp1; + + if (old_rp) + list_add(&old_rp->rtort_free, &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { + if (rcu_torture_pipe_update_one(rp)) { + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } +} + static void rcu_torture_cb(struct rcu_head *p) { - int i; struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); if (torture_must_stop_irq()) { @@ -294,16 +337,10 @@ rcu_torture_cb(struct rcu_head *p) /* The next initialization will pick up the pieces. 
*/ return; } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; + if (rcu_torture_pipe_update_one(rp)) rcu_torture_free(rp); - } else { + else cur_ops->deferred_free(rp); - } } static int rcu_no_completed(void) @@ -331,6 +368,8 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, + .get_state = get_state_synchronize_rcu, + .cond_sync = cond_synchronize_rcu, .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, @@ -705,16 +744,39 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool exp; + unsigned long gp_snap; + bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; int i; struct rcu_torture *rp; - struct rcu_torture *rp1; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); + int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET }; + int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); set_user_nice(current, MAX_NICE); + /* Initialize synctype[] array. If none set, take default. */ + if (!gp_cond1 && !gp_exp1 && !gp_normal1) + gp_cond1 = gp_exp1 = gp_normal1 = true; + if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) + synctype[nsynctypes++] = RTWS_COND_GET; + else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) + pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); + if (gp_exp1 && cur_ops->exp_sync) + synctype[nsynctypes++] = RTWS_EXP_SYNC; + else if (gp_exp && !cur_ops->exp_sync) + pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); + if (gp_normal1 && cur_ops->deferred_free) + synctype[nsynctypes++] = RTWS_DEF_FREE; + else if (gp_normal && !cur_ops->deferred_free) + pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); + if (WARN_ONCE(nsynctypes == 0, + "rcu_torture_writer: No update-side primitives.\n")) { + rcu_torture_writer_state = RTWS_STOPPING; + torture_kthread_stopping("rcu_torture_writer"); + } + do { rcu_torture_writer_state = RTWS_FIXED_DELAY; schedule_timeout_uninterruptible(1); @@ -736,32 +798,30 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - if (gp_normal == gp_exp) - exp = !!(torture_random(&rand) & 0x80); - else - exp = gp_exp; - if (!exp) { + switch (synctype[torture_random(&rand) % nsynctypes]) { + case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; cur_ops->deferred_free(old_rp); - } else { + break; + case RTWS_EXP_SYNC: rcu_torture_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); - list_add(&old_rp->rtort_free, - &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, - &rcu_torture_removed, - rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= - RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } + rcu_torture_pipe_update(old_rp); + break; + case RTWS_COND_GET: + rcu_torture_writer_state = RTWS_COND_GET; + gp_snap = cur_ops->get_state(); + i = torture_random(&rand) % 16; + if (i != 0) + schedule_timeout_interruptible(i); + udelay(torture_random(&rand) % 1000); + rcu_torture_writer_state = RTWS_COND_SYNC; + cur_ops->cond_sync(gp_snap); + rcu_torture_pipe_update(old_rp); + break; + default: + 
WARN_ON_ONCE(1); + break; } } rcutorture_record_progress(++rcu_torture_current_version); -- cgit v1.2.3 From f0bf8fab4f311cffa869a462ffd182465c4caee6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Mar 2014 16:17:56 -0700 Subject: rcutorture: Explicitly test synchronous grace-period primitives The original rcu_torture_writer() avoided testing the synchronous grace-period primitives because they were simply wrappers around call_rcu() invocations. The testing of these synchronous primitives was delegated to the fake writers. However, there really is no excuse not to test them, especially in the case of SRCU, where the wrappering is somewhat more elaborate. This commit therefore makes the default rcutorture parameters cause rcu_torture_writer() to include synchronous grace-period primitives in its testing. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0d27b9cc14e4..8b2748486c17 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -62,6 +62,7 @@ torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); +torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); @@ -147,8 +148,9 @@ static int rcu_torture_writer_state; #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 #define RTWS_COND_SYNC 6 -#define RTWS_STUTTER 7 -#define RTWS_STOPPING 8 +#define RTWS_SYNC 7 +#define RTWS_STUTTER 8 +#define RTWS_STOPPING 9 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -746,19 +748,21 @@ rcu_torture_writer(void *arg) { unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; + bool gp_sync1 = gp_sync; int i; struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); - int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET }; + int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, + RTWS_COND_GET, RTWS_SYNC }; int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); set_user_nice(current, MAX_NICE); /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1) - gp_cond1 = gp_exp1 = gp_normal1 = true; + if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) + gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) synctype[nsynctypes++] = RTWS_COND_GET; else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) @@ -771,8 +775,17 @@ rcu_torture_writer(void *arg) synctype[nsynctypes++] = RTWS_DEF_FREE; else if (gp_normal && !cur_ops->deferred_free) pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); + if (gp_sync1 && cur_ops->sync) + synctype[nsynctypes++] = RTWS_SYNC; + else if (gp_sync && !cur_ops->sync) + pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { + /* + * No updates primitives, so don't try updating. 
+ * The resulting test won't be testing much, hence the + * above WARN_ONCE(). + */ rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); } @@ -819,6 +832,11 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_SYNC: + rcu_torture_writer_state = RTWS_SYNC; + cur_ops->sync(); + rcu_torture_pipe_update(old_rp); + break; default: WARN_ON_ONCE(1); break; -- cgit v1.2.3 From 424c1b682051c48e1da24e503b96a8a72e114ea4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Mar 2014 08:58:27 -0700 Subject: rcutorture: Add missing destroy_timer_on_stack() The rcu_torture_reader() function uses an on-stack timer_list structure which it initializes with setup_timer_on_stack(). However, it fails to use destroy_timer_on_stack() before exiting, which results in leaking a tracking object if DEBUG_OBJECTS is enabled. This commit therefore invokes destroy_timer_on_stack() to avoid this leakage. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8b2748486c17..a7d18069a96d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1023,8 +1023,10 @@ rcu_torture_reader(void *arg) cond_resched(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); - if (irqreader && cur_ops->irq_capable) + if (irqreader && cur_ops->irq_capable) { del_timer_sync(&t); + destroy_timer_on_stack(&t); + } torture_kthread_stopping("rcu_torture_reader"); return 0; } -- cgit v1.2.3 From 48d684fdad83d7525a557e6ff9c37811b6a9947b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Apr 2014 14:57:13 -0700 Subject: rcutorture: Run rcu_torture_writer at normal priority There are usually lots of readers and only one writer, so if there has to be a choice, we would want rcu_torture_writer to win. This commit therefore removes the set_user_nice() from rcu_torture_writer(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a7d18069a96d..4b7b97ff1195 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -758,7 +758,6 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - set_user_nice(current, MAX_NICE); /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) -- cgit v1.2.3 From d065eacfdb9d47010f120e9310d7fc8ef2eba272 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Apr 2014 17:17:35 -0700 Subject: locktorture: Remove reference to nonexistent Kconfig parameter The locktorture module references CONFIG_LOCK_TORTURE_TEST_RUNNABLE, which does not exist. Which is a good thing, because otherwise randconfig testing could enable both rcutorture and locktorture concurrently, which the torture tests are not set up for. This commit therefore removes the reference, so that test is runnable immediately only when inserted as a module. Reported-by: Paul Bolle Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index b0d3e3c50672..1952466c7db5 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -82,14 +82,14 @@ struct lock_writer_stress_stats { }; static struct lock_writer_stress_stats *lwsa; -#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) +#if defined(MODULE) #define LOCKTORTURE_RUNNABLE_INIT 1 #else #define LOCKTORTURE_RUNNABLE_INIT 0 #endif int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; module_param(locktorture_runnable, int, 0444); -MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); +MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); /* Forward reference. */ static void lock_torture_cleanup(void); -- cgit v1.2.3 From 5228084eed8d54c426c7abde3be66daf8e1b0e57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Apr 2014 09:14:11 -0700 Subject: torture: Check for multiple concurrent torture tests The torture tests are designed to run in isolation, but do not enforce this isolation. This commit therefore checks for concurrent torture tests, and refuses to start new tests while old tests are running. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 2 +- kernel/locking/locktorture.c | 3 ++- kernel/rcu/rcutorture.c | 3 ++- kernel/torture.c | 13 +++++++++++-- 4 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index b2e2b468e511..f998574247fd 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -81,7 +81,7 @@ void stutter_wait(const char *title); int torture_stutter_init(int s); /* Initialization and cleanup. */ -void torture_init_begin(char *ttype, bool v, int *runnable); +bool torture_init_begin(char *ttype, bool v, int *runnable); void torture_init_end(void); bool torture_cleanup(void); bool torture_must_stop(void); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 1952466c7db5..dbafeac18e4d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -355,7 +355,8 @@ static int __init lock_torture_init(void) &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, }; - torture_init_begin(torture_type, verbose, &locktorture_runnable); + if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) + return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4b7b97ff1195..7fa34f86e5ba 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1536,7 +1536,8 @@ rcu_torture_init(void) &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, }; - torture_init_begin(torture_type, verbose, &rcutorture_runnable); + if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) + return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { diff --git a/kernel/torture.c b/kernel/torture.c index ae1723a4c751..0ed0b49d2ce1 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -599,14 +599,20 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. 
If there is no such flag, pass in NULL. */ -void __init torture_init_begin(char *ttype, bool v, int *runnable) +bool __init torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); + if (torture_type != NULL) { + pr_alert("torture_init_begin: refusing %s init: %s running", + ttype, torture_type); + mutex_unlock(&fullstop_mutex); + return false; + } torture_type = ttype; verbose = v; torture_runnable = runnable; fullstop = FULLSTOP_DONTSTOP; - + return true; } EXPORT_SYMBOL_GPL(torture_init_begin); @@ -645,6 +651,9 @@ bool torture_cleanup(void) torture_shuffle_cleanup(); torture_stutter_cleanup(); torture_onoff_cleanup(); + mutex_lock(&fullstop_mutex); + torture_type = NULL; + mutex_unlock(&fullstop_mutex); return false; } EXPORT_SYMBOL_GPL(torture_cleanup); -- cgit v1.2.3 From 2b3f8ffe46f843ac3ae589c25603d66da4a99620 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 16 Apr 2014 13:42:09 -0700 Subject: torture: Remove __init from torture_init_begin/end Loading rcutorture as a module (as opposed to building it directly into the kernel) results in the following splat: [Wed Apr 16 15:29:33 2014] BUG: unable to handle kernel paging request at ffffffffa0003000 [Wed Apr 16 15:29:33 2014] IP: [] 0xffffffffa0003000 [Wed Apr 16 15:29:33 2014] PGD 1c0f067 PUD 1c10063 PMD 378a6067 PTE 0 [Wed Apr 16 15:29:33 2014] Oops: 0010 [#1] SMP [Wed Apr 16 15:29:33 2014] Modules linked in: rcutorture(+) torture [Wed Apr 16 15:29:33 2014] CPU: 0 PID: 4257 Comm: modprobe Not tainted 3.15.0-rc1 #10 [Wed Apr 16 15:29:33 2014] Hardware name: innotek GmbH VirtualBox, BIOS VirtualBox 12/01/2006 [Wed Apr 16 15:29:33 2014] task: ffff8800db1e88d0 ti: ffff8800db25c000 task.ti: ffff8800db25c000 [Wed Apr 16 15:29:33 2014] RIP: 0010:[] [] 0xffffffffa0003000 [Wed Apr 16 15:29:33 2014] RSP: 0018:ffff8800db25dca0 EFLAGS: 00010282 [Wed Apr 16 15:29:33 2014] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [Wed Apr 16 15:29:33 2014] RDX: ffffffffa00090a8 RSI: 0000000000000001 RDI: ffffffffa0008337 [Wed Apr 16 15:29:33 2014] RBP: ffff8800db25dd50 R08: 0000000000000000 R09: 0000000000000000 [Wed Apr 16 15:29:33 2014] R10: ffffea000357b680 R11: ffffffff8113257a R12: ffffffffa000d000 [Wed Apr 16 15:29:33 2014] R13: ffffffffa00094c0 R14: ffffffffa0009510 R15: 0000000000000001 [Wed Apr 16 15:29:33 2014] FS: 00007fee30ce5700(0000) GS:ffff88021fc00000(0000) knlGS:0000000000000000 [Wed Apr 16 15:29:33 2014] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [Wed Apr 16 15:29:33 2014] CR2: ffffffffa0003000 CR3: 00000000d5eb1000 CR4: 00000000000006f0 [Wed Apr 16 15:29:33 2014] Stack: [Wed Apr 16 15:29:33 2014] ffffffffa000d02c 0000000000000000 ffff88021700d400 0000000000000000 [Wed Apr 16 15:29:33 2014] ffff8800db25dd40 ffffffff81647951 ffff8802162bd000 ffff88021541846c [Wed Apr 16 15:29:33 2014] 0000000000000000 ffffffff817dbe2d ffffffff817dbe2d 0000000000000001 [Wed Apr 16 15:29:33 2014] Call Trace: [Wed Apr 16 15:29:33 2014] [] ? rcu_torture_init+0x2c/0x8b4 [rcutorture] [Wed Apr 16 15:29:33 2014] [] ? netlink_broadcast_filtered+0x121/0x3a0 [Wed Apr 16 15:29:33 2014] [] ? mutex_lock+0xd/0x2a [Wed Apr 16 15:29:33 2014] [] ? mutex_lock+0xd/0x2a [Wed Apr 16 15:29:33 2014] [] ? trace_module_notify+0x62/0x1d0 [Wed Apr 16 15:29:33 2014] [] ? 0xffffffffa000cfff [Wed Apr 16 15:29:33 2014] [] do_one_initcall+0xfa/0x140 [Wed Apr 16 15:29:33 2014] [] ? __blocking_notifier_call_chain+0x5e/0x80 [Wed Apr 16 15:29:33 2014] [] load_module+0x1931/0x21b0 [Wed Apr 16 15:29:33 2014] [] ? 
show_initstate+0x50/0x50 [Wed Apr 16 15:29:33 2014] [] SyS_init_module+0x9e/0xc0 [Wed Apr 16 15:29:33 2014] [] system_call_fastpath+0x16/0x1b [Wed Apr 16 15:29:33 2014] Code: Bad RIP value. [Wed Apr 16 15:29:33 2014] RIP [] 0xffffffffa0003000 [Wed Apr 16 15:29:33 2014] RSP [Wed Apr 16 15:29:33 2014] CR2: ffffffffa0003000 [Wed Apr 16 15:29:33 2014] ---[ end trace 3e88c173037af84b ]--- This splat is due to the fact that torture_init_begin() and torture_init_end() are both marked with __init, despite their use at runtime. This commit therefore removes __init from both functions. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 0ed0b49d2ce1..40bb511cca48 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -599,7 +599,7 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -bool __init torture_init_begin(char *ttype, bool v, int *runnable) +bool torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { @@ -619,7 +619,7 @@ EXPORT_SYMBOL_GPL(torture_init_begin); /* * Tell the torture module that initialization is complete. */ -void __init torture_init_end(void) +void torture_init_end(void) { mutex_unlock(&fullstop_mutex); register_reboot_notifier(&torture_shutdown_nb); -- cgit v1.2.3 From e534165bbf6a04d001748c573c7d6a7bae3713a5 Mon Sep 17 00:00:00 2001 From: Uma Sharma Date: Sun, 23 Mar 2014 22:32:09 -0700 Subject: rcu: Variable name changed in tree_plugin.h and used in tree.c The variable and struct both having the name "rcu_state" confuses sparse in some situations, so this commit changes the variable to "rcu_state_p" in order to avoid this confusion. This also makes things easier for human readers. Signed-off-by: Uma Sharma [ paulmck: Changed the declaration and several additional uses. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 ++++++++-------- kernel/rcu/tree_plugin.h | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3bbe48939e47..3e3f13e8b429 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -101,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data) RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); -static struct rcu_state *rcu_state; +static struct rcu_state *rcu_state_p; LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ @@ -275,7 +275,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); */ void rcu_force_quiescent_state(void) { - force_quiescent_state(rcu_state); + force_quiescent_state(rcu_state_p); } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); @@ -327,7 +327,7 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, switch (test_type) { case RCU_FLAVOR: - rsp = rcu_state; + rsp = rcu_state_p; break; case RCU_BH_FLAVOR: rsp = &rcu_bh_state; @@ -910,7 +910,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * we will beat on the first one until it gets unstuck, then move * to the next. Only do this for the primary flavor of RCU. 
*/ - if (rdp->rsp == rcu_state && + if (rdp->rsp == rcu_state_p && ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { rdp->rsp->jiffies_resched += 5; resched_cpu(rdp->cpu); @@ -2660,7 +2660,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, rcu_state, -1, 1); + __call_rcu(head, func, rcu_state_p, -1, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -2787,7 +2787,7 @@ unsigned long get_state_synchronize_rcu(void) * time-consuming work between get_state_synchronize_rcu() * and cond_synchronize_rcu(). */ - return smp_load_acquire(&rcu_state->gpnum); + return smp_load_acquire(&rcu_state_p->gpnum); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -2813,7 +2813,7 @@ void cond_synchronize_rcu(unsigned long oldstate) * Ensure that this load happens before any RCU-destructive * actions the caller might carry out after we return. */ - newstate = smp_load_acquire(&rcu_state->completed); + newstate = smp_load_acquire(&rcu_state_p->completed); if (ULONG_CMP_GE(oldstate, newstate)) synchronize_rcu(); } @@ -3354,7 +3354,7 @@ static int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2e579c38bd91..29977ae84e7e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -static struct rcu_state *rcu_state = &rcu_preempt_state; +static struct rcu_state *rcu_state_p = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); @@ -947,7 +947,7 @@ void exit_rcu(void) #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -static struct rcu_state *rcu_state = &rcu_sched_state; +static struct rcu_state *rcu_state_p = &rcu_sched_state; /* * Tell them what RCU they are running. @@ -1468,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + rnp = rcu_get_root(rcu_state_p); + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } return 0; } @@ -1480,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads); static void rcu_prepare_kthreads(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ if (rcu_scheduler_fully_active) - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ -- cgit v1.2.3 From 1f0b63866fc1be700260547be8edf8e6f0af37f2 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 15 May 2014 23:29:57 +0200 Subject: ACPI / PM: Hold ACPI scan lock over the "freeze" sleep state The "freeze" sleep state suffers from the same issue that was addressed by commit ad07277e82de (ACPI / PM: Hold acpi_scan_lock over system PM transitions) for ACPI sleep states, that is, things break if ->remove() is called for devices whose system resume callbacks haven't been executed yet. It also can be addressed in the same way, by holding the ACPI scan lock over the "freeze" sleep state and PM transitions to and from that state, but ->begin() and ->end() platform operations for the "freeze" sleep state are needed for this purpose. This change has been tested on Acer Aspire S5 with Thunderbolt. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 18 ++++++++++++++++++ include/linux/suspend.h | 7 +++++++ kernel/power/suspend.c | 15 +++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'kernel') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 2281ca31c1bc..c11e3795431b 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -612,6 +612,22 @@ static const struct platform_suspend_ops acpi_suspend_ops_old = { .recover = acpi_pm_finish, }; +static int acpi_freeze_begin(void) +{ + acpi_scan_lock_acquire(); + return 0; +} + +static void acpi_freeze_end(void) +{ + acpi_scan_lock_release(); +} + +static const struct platform_freeze_ops acpi_freeze_ops = { + .begin = acpi_freeze_begin, + .end = acpi_freeze_end, +}; + static void acpi_sleep_suspend_setup(void) { int i; @@ -622,7 +638,9 @@ static void acpi_sleep_suspend_setup(void) suspend_set_ops(old_suspend_ordering ? &acpi_suspend_ops_old : &acpi_suspend_ops); + freeze_set_ops(&acpi_freeze_ops); } + #else /* !CONFIG_SUSPEND */ static inline void acpi_sleep_suspend_setup(void) {} #endif /* !CONFIG_SUSPEND */ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index f73cabf59012..91d66fd8dce1 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -187,6 +187,11 @@ struct platform_suspend_ops { void (*recover)(void); }; +struct platform_freeze_ops { + int (*begin)(void); + void (*end)(void); +}; + #ifdef CONFIG_SUSPEND /** * suspend_set_ops - set platform dependent suspend operations @@ -194,6 +199,7 @@ struct platform_suspend_ops { */ extern void suspend_set_ops(const struct platform_suspend_ops *ops); extern int suspend_valid_only_mem(suspend_state_t state); +extern void freeze_set_ops(const struct platform_freeze_ops *ops); extern void freeze_wake(void); /** @@ -220,6 +226,7 @@ extern int pm_suspend(suspend_state_t state); static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } +static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {} static inline void freeze_wake(void) {} #endif /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8233cd4047d7..73a905f83972 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -38,6 +38,7 @@ const char *const pm_states[PM_SUSPEND_MAX] = { }; static const struct platform_suspend_ops *suspend_ops; +static const struct platform_freeze_ops *freeze_ops; static bool need_suspend_ops(suspend_state_t state) { @@ -47,6 +48,13 @@ static bool need_suspend_ops(suspend_state_t state) static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); static bool suspend_freeze_wake; +void freeze_set_ops(const struct platform_freeze_ops *ops) +{ + lock_system_sleep(); + freeze_ops = ops; + 
unlock_system_sleep(); +} + static void freeze_begin(void) { suspend_freeze_wake = false; @@ -269,6 +277,10 @@ int suspend_devices_and_enter(suspend_state_t state) error = suspend_ops->begin(state); if (error) goto Close; + } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { + error = freeze_ops->begin(); + if (error) + goto Close; } suspend_console(); suspend_test_start(); @@ -294,6 +306,9 @@ int suspend_devices_and_enter(suspend_state_t state) Close: if (need_suspend_ops(state) && suspend_ops->end) suspend_ops->end(); + else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) + freeze_ops->end(); + trace_machine_suspend(PWR_EVENT_EXIT); return error; -- cgit v1.2.3 From 7b6ef1262549f6afc5c881aaef80beb8fd15f908 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:05 +0000 Subject: genirq: Provide generic hwirq allocation facility Not really the solution to the problem, but at least it confines the mess in the core code and allows to get rid of the create/destroy_irq variants from hell, i.e. 3 implementations with different semantics plus the x86 specific variants __create_irqs and create_irq_nr which have been invented in another circle of hell. x86 : x86 should be converted to irq domains and I'm deliberately making it impossible to do the multi-vector MSI support by adding more crap to the current mess. It's not that hard to do and I'm really tired of the trainwrecks which have been invented by baindaid engineering so far. Any attempt to do multi-vector MSI or ioapic hotplug without converting to irq domains is NAKed hereby. tile: Might use irq domains as well, but it has a very limited interrupt space, so handling it via this functionality might be the right thing to do even in the long run. ia64: That's an hopeless case, as I doubt that anyone has the stomach to rewrite the homebrewn dynamic allocation facilities. I stared at it for a couple of hours and gave up. The create/destroy_irq mess could be made private to itanic right away if there wouldn't be the iommu/dmar driver being shared with x86. So to do that I'm going to add a separate ia64 specific implementation later in order not to deep-six itanic right away. 
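[ Illustrative sketch, not part of this patch: how a hypothetical driver on an architecture that selects GENERIC_IRQ_LEGACY_ALLOC_HWIRQ and implements arch_setup_hwirq()/arch_teardown_hwirq() might use the helpers added below. The example_* names are invented; per the kerneldoc below, the allocator returns 0 on failure. ]

#include <linux/irq.h>
#include <linux/errno.h>

static unsigned int example_irq;

static int example_init(int node)
{
	/* Allocate one irq descriptor plus the arch-level hardware setup. */
	example_irq = irq_alloc_hwirq(node);
	if (!example_irq)	/* 0 means the allocation failed */
		return -ENOSPC;
	return 0;
}

static void example_exit(void)
{
	/* Undo the arch setup and free the descriptor again. */
	irq_free_hwirq(example_irq);
}
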
Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Cc: Tony Luck Cc: Peter Zijlstra Cc: Chris Metcalf Cc: Fenghua Yu Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20140507154334.208629358@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 15 +++++++++++++++ kernel/irq/Kconfig | 5 +++++ kernel/irq/irqdesc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 5c57efb863d0..c75dd161d37f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -637,6 +637,21 @@ static inline int irq_reserve_irq(unsigned int irq) return irq_reserve_irqs(irq, 1); } +#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ +unsigned int irq_alloc_hwirqs(int cnt, int node); +static inline unsigned int irq_alloc_hwirq(int node) +{ + return irq_alloc_hwirqs(1, node); +} +void irq_free_hwirqs(unsigned int from, int cnt); +static inline void irq_free_hwirq(unsigned int irq) +{ + return irq_free_hwirqs(irq, 1); +} +int arch_setup_hwirq(unsigned int irq, int node); +void arch_teardown_hwirq(unsigned int irq); +#endif + #ifndef irq_reg_writel # define irq_reg_writel(val, addr) writel(val, addr) #endif diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 07cbdfea9ae2..a83f10e406c1 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -17,6 +17,11 @@ config GENERIC_IRQ_SHOW config GENERIC_IRQ_SHOW_LEVEL bool +# Facility to allocate a hardware interrupt. This is legacy support +# and should not be used in new code. Use irq domains instead. +config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + bool + # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index bb07f2928f4b..f388ade5e792 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -396,6 +396,57 @@ err: } EXPORT_SYMBOL_GPL(__irq_alloc_descs); +#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ +/** + * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware + * @cnt: number of interrupts to allocate + * @node: node on which to allocate + * + * Returns an interrupt number > 0 or 0, if the allocation fails. + */ +unsigned int irq_alloc_hwirqs(int cnt, int node) +{ + int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); + + if (irq < 0) + return 0; + + for (i = irq; cnt > 0; i++, cnt--) { + if (arch_setup_hwirq(i, node)) + goto err; + irq_clear_status_flags(i, _IRQ_NOREQUEST); + } + return irq; + +err: + for (i--; i >= irq; i--) { + irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); + arch_teardown_hwirq(i); + } + irq_free_descs(irq, cnt); + return 0; +} +EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); + +/** + * irq_free_hwirqs - Free irq descriptor and cleanup the hardware + * @from: Free from irq number + * @cnt: number of interrupts to free + * + */ +void irq_free_hwirqs(unsigned int from, int cnt) +{ + int i; + + for (i = from; cnt > 0; i++, cnt--) { + irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); + arch_teardown_hwirq(i); + } + irq_free_descs(from, cnt); +} +EXPORT_SYMBOL_GPL(irq_free_hwirqs); +#endif + /** * irq_reserve_irqs - mark irqs allocated * @from: mark from irq number -- cgit v1.2.3 From f63b6a05f2b11537612266a8b27a61f412344a1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:21 +0000 Subject: genirq: Replace reserve_irqs in core code We want to get rid of the public interface. 
Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Tested-by: Tony Luck Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507154340.061990194@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 5 ++--- kernel/irq/internals.h | 6 ++++++ kernel/irq/irqdesc.c | 7 +++++++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6397df2d6945..a2b28a2fd7b1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in - * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is - * already marked, and this call is harmless. + * allocated_irqs. */ - irq_reserve_irq(irq); + irq_mark_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ddf1ffeb79f1..c4065e3edbc3 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,6 +76,12 @@ extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); +#ifdef CONFIG_SPARSE_IRQ +static inline void irq_mark_irq(unsigned int irq) { } +#else +extern void irq_mark_irq(unsigned int irq); +#endif + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index f388ade5e792..24029348729b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -299,6 +299,13 @@ static int irq_expand_nr_irqs(unsigned int nr) return -ENOMEM; } +void irq_mark_irq(unsigned int irq) +{ + mutex_lock(&sparse_irq_lock); + bitmap_set(allocated_irqs, irq, 1); + mutex_unlock(&sparse_irq_lock); +} + #endif /* !CONFIG_SPARSE_IRQ */ /** -- cgit v1.2.3 From 1d008353ba088fdec0b2a944e140ff9154a5fb20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:21 +0000 Subject: genirq: Remove irq_reserve_irq[s] No more users. And it's not going to come back. If you need hotplugable irq chips, use irq domains. 
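[ For illustration only, a rough sketch of the irq-domain alternative suggested above; the example_* names are invented and a real irq_chip would also provide mask/unmask callbacks. A driver registers a linear domain covering its hardware interrupt numbers and creates mappings on demand instead of reserving fixed Linux irq numbers. ]

#include <linux/errno.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static struct irq_chip example_chip = {
	.name		= "example",
};

static int example_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hw)
{
	/* Called once per new mapping: attach chip and flow handler. */
	irq_set_chip_and_handler(virq, &example_chip, handle_level_irq);
	return 0;
}

static const struct irq_domain_ops example_ops = {
	.map	= example_map,
};

static struct irq_domain *example_domain;

static int example_probe(struct device_node *np)
{
	unsigned int virq;

	/* One domain for 32 hardware interrupt lines. */
	example_domain = irq_domain_add_linear(np, 32, &example_ops, NULL);
	if (!example_domain)
		return -ENOMEM;

	/* Get (or create) the Linux irq number for hardware line 5. */
	virq = irq_create_mapping(example_domain, 5);
	if (!virq)
		return -EINVAL;
	return 0;
}

Teardown is then the reverse: dispose of the mappings and remove the domain with irq_domain_remove(), with no fixed irq range ever reserved.
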
Signed-off-by: Thomas Gleixner Reviewed-and-acked-by: Grant Likely Tested-by: Tony Luck Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507154340.302183048@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 7 ------- kernel/irq/irqdesc.c | 25 ------------------------- 2 files changed, 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index ac9634286f42..2110f46fcafa 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -617,18 +617,11 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, irq_alloc_descs(-1, from, cnt, node) void irq_free_descs(unsigned int irq, unsigned int cnt); -int irq_reserve_irqs(unsigned int from, unsigned int cnt); - static inline void irq_free_desc(unsigned int irq) { irq_free_descs(irq, 1); } -static inline int irq_reserve_irq(unsigned int irq) -{ - return irq_reserve_irqs(irq, 1); -} - #ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ unsigned int irq_alloc_hwirqs(int cnt, int node); static inline unsigned int irq_alloc_hwirq(int node) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 24029348729b..d514ed6080e1 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -454,31 +454,6 @@ void irq_free_hwirqs(unsigned int from, int cnt) EXPORT_SYMBOL_GPL(irq_free_hwirqs); #endif -/** - * irq_reserve_irqs - mark irqs allocated - * @from: mark from irq number - * @cnt: number of irqs to mark - * - * Returns 0 on success or an appropriate error code - */ -int irq_reserve_irqs(unsigned int from, unsigned int cnt) -{ - unsigned int start; - int ret = 0; - - if (!cnt || (from + cnt) > nr_irqs) - return -EINVAL; - - mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); - if (start == from) - bitmap_set(allocated_irqs, start, cnt); - else - ret = -EEXIST; - mutex_unlock(&sparse_irq_lock); - return ret; -} - /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search -- cgit v1.2.3 From c940e01c94e73a2a5318f1b82038e0746aaec753 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:22 +0000 Subject: genirq: Replace dynamic_irq_init/cleanup Create a new interface and confine it with a config switch which makes clear that this is just legacy support and not to be used for new code. 
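[ A minimal conversion sketch, assuming a made-up architecture that still needs the old dynamic_irq_init() behaviour: it selects GENERIC_IRQ_LEGACY in its Kconfig and switches its call site to the helper added below. ]

/* arch/example code, after "select GENERIC_IRQ_LEGACY" in Kconfig */
#include <linux/irq.h>

static void example_recycle_irq(unsigned int irq)
{
	/* Previously dynamic_irq_init(irq); resets the descriptor to defaults. */
	irq_init_desc(irq);
}
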
Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Tested-by: Tony Luck Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507154340.574437049@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++++ kernel/irq/Kconfig | 4 ++++ kernel/irq/irqdesc.c | 7 +++++++ 3 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 2110f46fcafa..8ff71d14365a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -637,6 +637,10 @@ int arch_setup_hwirq(unsigned int irq, int node); void arch_teardown_hwirq(unsigned int irq); #endif +#ifdef CONFIG_GENERIC_IRQ_LEGACY +void irq_init_desc(unsigned int irq); +#endif + #ifndef irq_reg_writel # define irq_reg_writel(val, addr) writel(val, addr) #endif diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index a83f10e406c1..d269cecdfbf0 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -5,6 +5,10 @@ menu "IRQ subsystem" config MAY_HAVE_SPARSE_IRQ bool +# Legacy support, required for itanic +config GENERIC_IRQ_LEGACY + bool + # Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index d514ed6080e1..7f267799a717 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -306,6 +306,13 @@ void irq_mark_irq(unsigned int irq) mutex_unlock(&sparse_irq_lock); } +#ifdef CONFIG_GENERIC_IRQ_LEGACY +void irq_init_desc(unsigned int irq) +{ + dynamic_irq_cleanup(irq); +} +#endif + #endif /* !CONFIG_SPARSE_IRQ */ /** -- cgit v1.2.3 From d8179bc0db8d0c9654d5de43de2874bf6d0a58fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:23 +0000 Subject: genirq: Remove dynamic_irq mess No more users. Get rid of the cruft. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Tested-by: Tony Luck Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507154341.012847637@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 10 ---------- kernel/irq/irqdesc.c | 23 +++++++---------------- 2 files changed, 7 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 8ff71d14365a..0d998d8b01d8 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -525,16 +525,6 @@ static inline void irq_set_percpu_devid_flags(unsigned int irq) IRQ_NOPROBE | IRQ_PER_CPU_DEVID); } -/* - * Dynamic irq helper functions. Obsolete. Use irq_alloc_desc* and - * irq_free_desc instead. 
- */ -extern void dynamic_irq_cleanup(unsigned int irq); -static inline void dynamic_irq_init(unsigned int irq) -{ - dynamic_irq_cleanup(irq); -} - /* Set/get chip/data for an IRQ: */ extern int irq_set_chip(unsigned int irq, struct irq_chip *chip); extern int irq_set_handler_data(unsigned int irq, void *data); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7f267799a717..7339e42a85ab 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { - dynamic_irq_cleanup(irq); + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc_set_defaults(irq, desc, desc_node(desc), NULL); + raw_spin_unlock_irqrestore(&desc->lock, flags); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, @@ -309,7 +314,7 @@ void irq_mark_irq(unsigned int irq) #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq) { - dynamic_irq_cleanup(irq); + free_desc(irq); } #endif @@ -522,20 +527,6 @@ int irq_set_percpu_devid(unsigned int irq) return 0; } -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, desc_node(desc), NULL); - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); -- cgit v1.2.3 From f6514be5fe7fe796041b673bad769510414ff2b9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 May 2014 19:08:46 +0300 Subject: PM / hibernate: Fix memory corruption in resumedelay_setup() In the original code "resume_delay" is an int so on 64 bits, the call to kstrtoul() will cause memory corruption. We may as well fix a style issue here as well and make "resume_delay" unsigned int, since that's what we pass to ssleep(). Fixes: 317cf7e5e85e (PM / hibernate: convert simple_strtoul to kstrtoul) Signed-off-by: Dan Carpenter Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2377ff72994c..df88d55dc436 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -35,7 +35,7 @@ static int nocompress; static int noresume; static int resume_wait; -static int resume_delay; +static unsigned int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; @@ -1115,7 +1115,7 @@ static int __init resumewait_setup(char *str) static int __init resumedelay_setup(char *str) { - int rc = kstrtoul(str, 0, (unsigned long *)&resume_delay); + int rc = kstrtouint(str, 0, &resume_delay); if (rc) return rc; -- cgit v1.2.3 From 12665b35b0b48c9583ee1b8f0a403dc708fb4a92 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 10 May 2014 13:10:59 +0200 Subject: perf/events/core: Drop unused variable after cleanup ... 
in 3a497f48637 ("perf: Simplify perf_event_exit_task_context()") Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399720259-28275-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7ab734fbaeeb..ed50b0943213 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7431,7 +7431,7 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *tmp; + struct perf_event *child_event; struct perf_event_context *child_ctx; unsigned long flags; -- cgit v1.2.3 From 61f38db3e3c0e4c3be0858750e2cabeadaecac0c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 26 Apr 2014 23:15:35 -0700 Subject: rcu: Provide API to suppress stall warnings while sysrq runs Some sysrq handlers can run for a long time, because they dump a lot of data onto a serial console. Having RCU stall warnings pop up in the middle of them only makes the problem worse. This commit provides rcu_sysrq_start() and rcu_sysrq_end() APIs to temporarily suppress RCU CPU stall warnings while a sysrq request is handled. Signed-off-by: Rik van Riel [ paulmck: Fix TINY_RCU build error. ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 12 ++++++++++++ kernel/rcu/update.c | 12 ++++++++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9ccd644c1234..5a75d19aa661 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -248,6 +248,18 @@ void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +#ifdef CONFIG_RCU_STALL_COMMON +void rcu_sysrq_start(void); +void rcu_sysrq_end(void); +#else /* #ifdef CONFIG_RCU_STALL_COMMON */ +static inline void rcu_sysrq_start(void) +{ +} +static inline void rcu_sysrq_end(void) +{ +} +#endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ + #ifdef CONFIG_RCU_USER_QS void rcu_user_enter(void); void rcu_user_exit(void); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ed7a0d72562c..a2aeb4df0f60 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -320,6 +320,18 @@ int rcu_jiffies_till_stall_check(void) return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; } +void rcu_sysrq_start(void) +{ + if (!rcu_cpu_stall_suppress) + rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ + if (rcu_cpu_stall_suppress == 2) + rcu_cpu_stall_suppress = 0; +} + static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { rcu_cpu_stall_suppress = 1; -- cgit v1.2.3 From 049d595a4db3b3a20fc252298010f0545ef659a3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 9 May 2014 19:18:08 +0100 Subject: PM / OPP: Make OPP invisible to users in Kconfig The OPP code is an in-kernel library selected by its users, there is no architecture code required to implement it and enabling it without a user just increases the kernel size. Since the users select rather than depend on it, just remove the ability to directly set the option from Kconfig. Signed-off-by: Mark Brown Acked-by: Nishanth Menon Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- kernel/power/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2fac9cc79b3d..9a83d780facd 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -257,8 +257,7 @@ config ARCH_HAS_OPP bool config PM_OPP - bool "Operating Performance Point (OPP) Layer library" - depends on ARCH_HAS_OPP + bool ---help--- SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. This -- cgit v1.2.3 From 3944a9274ef6cda0cc282daf0739832f661670f7 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Thu, 15 May 2014 15:59:20 -0700 Subject: sched: Fix exec_start/task_hot on migrated tasks task_hot checks exec_start on any runnable task, but if it has been migrated since it last ran, then exec_start is a clock_task from another cpu. If the old cpu's clock_task was sufficiently far ahead of this cpu's then the task will not be considered for another migration until it has run. Instead reset exec_start whenever a task is migrated, since it is presumably no longer hot anyway. Signed-off-by: Ben Segall [ Made it compile. ] Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140515225920.7179.13924.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28ccf502c63c..dd3fa14a2998 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4544,6 +4544,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic_long_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } + + /* We have migrated, no longer consider this task hot */ + se->exec_start = 0; } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From e78c7bca56dab5ce4f22694f99115ec07e4935f6 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 9 May 2014 16:54:28 +0200 Subject: sched: Simplify return logic in sched_copy_attr() The logic in this function is a little contorted; clean it up: * Rather than having chained gotos for the -E2BIG case, just return -E2BIG directly. * Now, the label 'out' is no longer needed, and 'ret' must be zero by the time we fall through to this point, so just return 0. Signed-off-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/536CEC24.9080201@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2551b6db470e..2318fc484d8b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3657,13 +3657,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr, */ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); -out: - return ret; + return 0; err_size: put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; + return -E2BIG; } /** -- cgit v1.2.3 From 22400674945c562cf3c176d6c4178cd545e2dec9 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 9 May 2014 16:54:33 +0200 Subject: sched: Simplify return logic in sched_read_attr() Gotos are chained pointlessly here, and the 'out' label can be dispensed with. 
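The same clean-up pattern in isolation (a generic sketch, not the scheduler code itself): a goto chain that exists only to select a return value collapses into direct returns:

#include <linux/errno.h>
#include <linux/types.h>

/* Before: chained gotos just to pick 'ret'. */
static int check_size_contorted(size_t usize, size_t limit)
{
        int ret = 0;

        if (usize > limit)
                goto err_size;
out:
        return ret;
err_size:
        ret = -E2BIG;
        goto out;
}

/* After: no labels, the intent is visible at the point of failure. */
static int check_size_simple(size_t usize, size_t limit)
{
        if (usize > limit)
                return -E2BIG;
        return 0;
}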
Signed-off-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/536CEC29.9090503@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2318fc484d8b..a78c5b62a470 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3821,7 +3821,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, for (; addr < end; addr++) { if (*addr) - goto err_size; + return -EFBIG; } attr->size = usize; @@ -3831,12 +3831,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, if (ret) return -EFAULT; -out: - return ret; - -err_size: - ret = -E2BIG; - goto out; + return 0; } /** -- cgit v1.2.3 From 8bf21433f38b020c3d8a3805d1d7fb73d7b40c01 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 14 May 2014 11:40:37 -0400 Subject: sched: Call select_idle_sibling() when not affine_sd On smaller systems, the top level sched domain will be an affine domain, and select_idle_sibling is invoked for every SD_WAKE_AFFINE wakeup. This seems to be working well. On larger systems, with the node distance between far away NUMA nodes being > RECLAIM_DISTANCE, select_idle_sibling is only called if the waker and the wakee are on nodes less than RECLAIM_DISTANCE apart. This patch leaves in place the policy of not pulling the task across nodes on such systems, while fixing the issue that select_idle_sibling is not called at all in certain circumstances. The code will look for an idle CPU in the same CPU package as the CPU where the task ran previously. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: morten.rasmussen@arm.com Cc: george.mccollister@gmail.com Cc: ktkhai@parallels.com Cc: Mel Gorman Cc: Mike Galbraith Link: http://lkml.kernel.org/r/20140514114037.2d93266f@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd3fa14a2998..429164d117ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4473,10 +4473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f sd = tmp; } - if (affine_sd) { - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + if (sd_flag & SD_BALANCE_WAKE) { new_cpu = select_idle_sibling(p, prev_cpu); goto unlock; } -- cgit v1.2.3 From c515db8cd311ef77b2dc7cbd6b695022655bb0f3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 13 May 2014 11:11:01 +0200 Subject: sched/numa: Fix initialization of sched_domain_topology for NUMA Jet Chen has reported a kernel panic when booting qemu-system-x86_64 with kvm64 cpu. A panic occurred while building the sched_domain. In sched_init_numa, we create a new topology table in which both default levels and numa levels are copied. The last row of the table must have a null pointer in the mask field. The current implementation doesn't add this last row in the computation of the table size. So we add 1 row in the allocation size that will be used as the last row of the table. The kzalloc will ensure that the mask field is NULL. 
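In other words, the table needs one extra zeroed row to act as the NULL terminator. A reduced sketch of the pattern (types and names are simplified for illustration, not the actual scheduler structures):

#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/string.h>

struct demo_level {
        const struct cpumask *(*mask)(int cpu); /* NULL marks the last row */
};

static struct demo_level *demo_build_table(const struct demo_level *defaults,
                                           int nr_default, int nr_extra)
{
        struct demo_level *tl;

        /* nr_default + nr_extra real rows plus one terminating row;
         * kzalloc() leaves the terminator zeroed, so .mask is NULL.
         */
        tl = kzalloc((nr_default + nr_extra + 1) * sizeof(*tl), GFP_KERNEL);
        if (!tl)
                return NULL;

        memcpy(tl, defaults, nr_default * sizeof(*tl));
        return tl;
}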
Reported-by: Jet Chen Tested-by: Jet Chen Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: fengguang.wu@intel.com Link: http://lkml.kernel.org/r/1399972261-25693-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a78c5b62a470..45d077ed24fb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6231,7 +6231,7 @@ static void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level) * + tl = kzalloc((i + level + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; -- cgit v1.2.3 From caffcdd8d27ba78730d5540396ce72ad022aff2c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 30 Apr 2014 14:39:38 +0100 Subject: sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups() There is no need to zero struct sched_group member cpumask and struct sched_group_power member power since both structures are already allocated as zeroed memory in __sdt_alloc(). This patch has been tested with BUG_ON(!cpumask_empty(sched_group_cpus(sg))); and BUG_ON(sg->sgp->power); in build_sched_groups() on ARM TC2 and INTEL i5 M520 platform including CPU hotplug scenarios. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1398865178-12577-1-git-send-email-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 45d077ed24fb..6340c601475d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5794,8 +5794,6 @@ build_sched_groups(struct sched_domain *sd, int cpu) continue; group = get_group(i, sdd, &sg); - cpumask_clear(sched_group_cpus(sg)); - sg->sgp->power = 0; cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { -- cgit v1.2.3 From a9467fa3cd2d5bf39e7cb7d0706d29d7ef4df212 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Thu, 8 May 2014 18:35:15 +0900 Subject: sched: Use clamp() and clamp_val() to make sys_nice() more readable Suggested-by: Kees Cook Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399541715-19568-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6340c601475d..f5605b6deea4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3057,17 +3057,10 @@ SYSCALL_DEFINE1(nice, int, increment) * We don't have to worry. Conceptually one call occurs first * and we have a single winner. */ - if (increment < -40) - increment = -40; - if (increment > 40) - increment = 40; - + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); nice = task_nice(current) + increment; - if (nice < MIN_NICE) - nice = MIN_NICE; - if (nice > MAX_NICE) - nice = MAX_NICE; + nice = clamp_val(nice, MIN_NICE, MAX_NICE); if (increment < 0 && !can_nice(current, nice)) return -EPERM; -- cgit v1.2.3 From 52a08ef1f13a11289c9e18cd4cfb4e51c024058b Mon Sep 17 00:00:00 2001 From: Jason Low Date: Thu, 8 May 2014 17:49:22 -0700 Subject: sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance() Currently, in idle_balance(), we update rq->next_balance when we pull_tasks. 
However, it is also important to update this in the !pulled_tasks case too. When the CPU is "busy" (the CPU isn't idle), rq->next_balance gets computed using sd->busy_factor (so we increase the balance interval when the CPU is busy). However, when the CPU goes idle, rq->next_balance could still be set to a large value that was computed with the sd->busy_factor. Thus, we need to also update rq->next_balance in idle_balance() in the cases where !pulled_tasks too, so that rq->next_balance gets updated without taking the busy_factor into account when the CPU is about to go idle. This patch makes rq->next_balance get updated independently of whether or not we pulled_task. Also, we add logic to ensure that we always traverse at least 1 of the sched domains to get a proper next_balance value for updating rq->next_balance. Additionally, since load_balance() modifies the sd->balance_interval, we need to re-obtain the sched domain's interval after the call to load_balance() in rebalance_domains() before we update rq->next_balance. This patch adds and uses 2 new helper functions, update_next_balance() and get_sd_balance_interval() to update next_balance and obtain the sched domain's balance_interval. Signed-off-by: Jason Low Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Cc: morten.rasmussen@arm.com Cc: aswin@hp.com Link: http://lkml.kernel.org/r/1399596562.2200.7.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 69 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 429164d117ea..26ec6686a00b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6672,17 +6672,44 @@ out: return ld_moved; } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ + unsigned long interval = sd->balance_interval; + + if (cpu_busy) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + + return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ + unsigned long interval, next; + + interval = get_sd_balance_interval(sd, cpu_busy); + next = sd->last_balance + interval; + + if (time_after(*next_balance, next)) + *next_balance = next; +} + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ static int idle_balance(struct rq *this_rq) { + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; struct sched_domain *sd; int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; u64 curr_cost = 0; - int this_cpu = this_rq->cpu; idle_enter_fair(this_rq); @@ -6692,8 +6719,15 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) + if (this_rq->avg_idle < sysctl_sched_migration_cost) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, 0, &next_balance); + rcu_read_unlock(); + goto out; + } /* * Drop the rq->lock, but keep IRQ/preempt disabled. 
@@ -6703,15 +6737,16 @@ static int idle_balance(struct rq *this_rq) update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - unsigned long interval; int continue_balancing = 1; u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, 0, &next_balance); break; + } if (sd->flags & SD_BALANCE_NEWIDLE) { t0 = sched_clock_cpu(this_cpu); @@ -6727,9 +6762,7 @@ static int idle_balance(struct rq *this_rq) curr_cost += domain_cost; } - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; + update_next_balance(sd, 0, &next_balance); /* * Stop searching for tasks to pull if there are @@ -6753,15 +6786,11 @@ static int idle_balance(struct rq *this_rq) if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. - */ +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; - } -out: /* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; @@ -7044,16 +7073,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; - - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - interval = clamp(interval, 1UL, max_load_balance_interval); + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { if (!spin_trylock(&balancing)) goto out; @@ -7069,6 +7091,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); } if (need_serialize) spin_unlock(&balancing); -- cgit v1.2.3 From 72465447867b9de6b5cdea5d10f9781585136270 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 9 May 2014 03:00:14 +0400 Subject: sched, nohz: Change rq->nr_running to always use wrappers Sometimes ->nr_running may cross 2 but interrupt is not being sent to rq's cpu. In this case we don't reenable the timer. Looks like this may be the reason for rare unexpected effects, if nohz is enabled. Patch replaces all places of direct changing of nr_running and makes add_nr_running() caring about crossing border. 
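The key detail is that the new helper compares the old and new counts instead of testing the new value for an exact match. A stripped-down sketch of the idea, with names invented for illustration and outside the scheduler:

/* Hypothetical counter that must fire a notification when it goes
 * from "one runnable task" to "more than one", even if several tasks
 * are added in a single call.
 */
struct demo_counter {
        unsigned int nr;
};

static void demo_went_busy(void)
{
        /* e.g. kick the tick back on for a nohz_full CPU */
}

static void demo_add(struct demo_counter *c, unsigned int count)
{
        unsigned int prev = c->nr;

        c->nr = prev + count;

        /* Catches 1 -> 3 as well as 1 -> 2; a plain '== 2' test on the
         * new value would miss the crossing when count > 1.
         */
        if (prev < 2 && c->nr >= 2)
                demo_went_busy();
}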
Signed-off-by: Kirill Tkhai Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140508225830.2469.97461.stgit@localhost Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 4 ++-- kernel/sched/fair.c | 8 ++++---- kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 12 +++++++----- kernel/sched/stop_task.c | 4 ++-- 5 files changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 800e99b99075..e0a04ae1e0dd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -741,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; - inc_nr_running(rq_of_dl_rq(dl_rq)); + add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -755,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; - dec_nr_running(rq_of_dl_rq(dl_rq)); + sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26ec6686a00b..f7cac2ba62ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3325,7 +3325,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running -= task_delta; + sub_nr_running(rq, task_delta); cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -3376,7 +3376,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running += task_delta; + add_nr_running(rq, task_delta); /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -3908,7 +3908,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { update_rq_runnable_avg(rq, rq->nr_running); - inc_nr_running(rq); + add_nr_running(rq, 1); } hrtick_update(rq); } @@ -3968,7 +3968,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } if (!se) { - dec_nr_running(rq); + sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); } hrtick_update(rq); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7795e292f4c9..0ebfd7a29472 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -973,7 +973,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) BUG_ON(!rq->nr_running); - rq->nr_running -= rt_rq->rt_nr_running; + sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; } @@ -989,7 +989,7 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) return; - rq->nr_running += rt_rq->rt_nr_running; + add_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 1; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b2cbe81308af..600e2291a75c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1206,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); -static inline void inc_nr_running(struct rq *rq) +static inline void add_nr_running(struct rq *rq, unsigned count) { - rq->nr_running++; + unsigned prev_nr = rq->nr_running; + + rq->nr_running = prev_nr + count; #ifdef CONFIG_NO_HZ_FULL - if (rq->nr_running == 2) { + if (prev_nr < 2 && rq->nr_running >= 2) { if (tick_nohz_full_cpu(rq->cpu)) { /* Order rq->nr_running write against the IPI */ smp_wmb(); @@ -1221,9 +1223,9 @@ static inline void 
inc_nr_running(struct rq *rq) #endif } -static inline void dec_nr_running(struct rq *rq) +static inline void sub_nr_running(struct rq *rq, unsigned count) { - rq->nr_running--; + rq->nr_running -= count; } static inline void rq_last_tick_reset(struct rq *rq) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d6ce65dde541..bfe0edadbfbb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { - inc_nr_running(rq); + add_nr_running(rq, 1); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { - dec_nr_running(rq); + sub_nr_running(rq, 1); } static void yield_task_stop(struct rq *rq) -- cgit v1.2.3 From a803f0261bb2bb57aab5542af3174db43b2a3887 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 8 May 2014 13:47:39 -0500 Subject: sched: Initialize rq->age_stamp on processor start If the sched_clock time starts at a large value, the kernel will spin in sched_avg_update for a long time while rq->age_stamp catches up with rq->clock. The comment in kernel/sched/clock.c says that there is no strict promise that it starts at zero. So initialize rq->age_stamp when a cpu starts up to avoid this. I was seeing long delays on a simulator that didn't start the clock at zero. This might also be an issue on reboots on processors that don't re-initialize the timer to zero on reset, and when using kexec. Signed-off-by: Corey Minyard Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399574859-11714-1-git-send-email-minyard@acm.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5605b6deea4..da302ca98f60 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5089,10 +5089,20 @@ static struct notifier_block migration_notifier = { .priority = CPU_PRI_MIGRATION, }; +static void __cpuinit set_cpu_rq_start_time(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + rq->age_stamp = sched_clock_cpu(cpu); +} + static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + set_cpu_rq_start_time(); + return NOTIFY_OK; case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -6970,6 +6980,7 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); + set_cpu_rq_start_time(); #endif init_sched_fair_class(); -- cgit v1.2.3 From 7aa2c016db2162defff77f6f5731bff3f25e5175 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Thu, 8 May 2014 18:33:49 +0900 Subject: sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice() Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/a568a1e3cc8e78648f41b5035fa5e381d36274da.1399532322.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- drivers/staging/android/binder.c | 2 +- include/linux/sched/prio.h | 16 ++++++++++++++++ kernel/sched/core.c | 2 +- kernel/sys.c | 6 +++--- 4 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 179b21b66504..9311bb67ec35 100644 --- a/drivers/staging/android/binder.c +++ 
b/drivers/staging/android/binder.c @@ -436,7 +436,7 @@ static void binder_set_nice(long nice) set_user_nice(current, nice); return; } - min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur; + min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur); binder_debug(BINDER_DEBUG_PRIORITY_CAP, "%d: nice value %ld not allowed use %ld instead\n", current->pid, nice, min_nice); diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index ac322583c820..d9cf5a5762d9 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -41,4 +41,20 @@ #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +/* + * Convert nice value [19,-20] to rlimit style value [1,40]. + */ +static inline long nice_to_rlimit(long nice) +{ + return (MAX_NICE - nice + 1); +} + +/* + * Convert rlimit style value [1,40] to nice value [-20, 19]. + */ +static inline long rlimit_to_nice(long prio) +{ + return (MAX_NICE - prio + 1); +} + #endif /* _SCHED_PRIO_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da302ca98f60..321d800e4baa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3033,7 +3033,7 @@ EXPORT_SYMBOL(set_user_nice); int can_nice(const struct task_struct *p, const int nice) { /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; + int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); diff --git a/kernel/sys.c b/kernel/sys.c index fba0f29401ea..66a751ebf9d9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) else p = current; if (p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } @@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) else pgrp = task_pgrp(current); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); @@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) do_each_thread(g, p) { if (uid_eq(task_uid(p), uid)) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } -- cgit v1.2.3 From 4027d080854d1be96ef134a1c3024d5276114db6 Mon Sep 17 00:00:00 2001 From: "xiaofeng.yan" Date: Fri, 9 May 2014 03:21:27 +0000 Subject: sched/rt: Fix 'struct sched_dl_entity' and dl_task_time() comments, to match the current upstream code Signed-off-by: xiaofeng.yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399605687-18094-1-git-send-email-xiaofeng.yan@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- kernel/sched/deadline.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 725eef121c9f..0f91d00efd87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1175,8 +1175,8 @@ struct sched_dl_entity { /* * Original scheduling parameters. Copied here from sched_attr - * during sched_setscheduler2(), they will remain the same until - * the next sched_setscheduler2(). + * during sched_setattr(), they will remain the same until + * the next sched_setattr(). 
*/ u64 dl_runtime; /* maximum runtime for each instance */ u64 dl_deadline; /* relative deadline of each instance */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e0a04ae1e0dd..f9ca7d19781a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * We need to take care of a possible races here. In fact, the * task might have changed its scheduling policy to something * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setscheduler()). + * parameters (through sched_setattr()). */ if (!dl_task(p) || dl_se->dl_new) goto unlock; -- cgit v1.2.3 From e63da03639cc9e6e83b62e7ef8ffdbb92421416a Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 14 May 2014 13:22:21 -0400 Subject: sched/numa: Allow task switch if load imbalance improves Currently the NUMA balancing code only allows moving tasks between NUMA nodes when the load on both nodes is in balance. This breaks down when the load was imbalanced to begin with. Allow tasks to be moved between NUMA nodes if the imbalance is small, or if the new imbalance is smaller than the original one. Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/20140514132221.274b3463@annuminas.surriel.com --- kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f7cac2ba62ea..b899613f2bc6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, + long src_load, long dst_load, + struct task_numa_env *env) +{ + long imb, old_imb; + + /* We care about the slope of the imbalance, not the direction. */ + if (dst_load < src_load) + swap(dst_load, src_load); + + /* Is the difference below the threshold? */ + imb = dst_load * 100 - src_load * env->imbalance_pct; + if (imb <= 0) + return false; + + /* + * The imbalance is above the allowed threshold. + * Compare it with the old imbalance. + */ + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); + + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + + /* Would this change make things worse? */ + return (old_imb > imb); +} + /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long dst_load, src_load; + long orig_src_load, src_load; + long orig_dst_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. 
*/ balance: - dst_load = env->dst_stats.load; - src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + orig_src_load = env->src_stats.load; /* XXX missing power terms */ load = task_h_load(env->p); - dst_load += load; - src_load -= load; + dst_load = orig_dst_load + load; + src_load = orig_src_load - load; if (cur) { load = task_h_load(cur); @@ -1195,11 +1224,8 @@ balance: src_load += load; } - /* make src_load the smaller */ - if (dst_load < src_load) - swap(dst_load, src_load); - - if (src_load * env->imbalance_pct < dst_load * 100) + if (load_too_imbalanced(orig_src_load, orig_dst_load, + src_load, dst_load, env)) goto unlock; assign: -- cgit v1.2.3 From b1ad065e65f56103db8b97edbd218a271ff5b1bb Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 15 May 2014 13:03:06 -0400 Subject: sched/numa: Update migrate_improves/degrades_locality() Update the migrate_improves/degrades_locality() functions with knowledge of pseudo-interleaving. Do not consider moving tasks around within the set of group's active nodes as improving or degrading locality. Instead, leave the load balancer free to balance the load between a numa_group's active nodes. Also, switch from the group/task_weight functions to the group/task_fault functions. The "weight" functions involve a division, but both calls use the same divisor, so there's no point in doing that from these functions. On a 4 node (x10 core) system, performance of SPECjbb2005 seems unaffected, though the number of migrations with 2 8-warehouse wide instances seems to have almost halved, due to the scheduler running each instance on a single node. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/20140515130306.61aae7db@cuia.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b899613f2bc6..503f750c2d25 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5123,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now) /* Returns true if the destination node has incurred more faults */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || @@ -5136,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - /* Always encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; + if (numa_group) { + /* Task is already in the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return false; + + /* Task is moving into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return true; - /* If both task and group weight improve, this move is a winner. */ - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && - group_weight(p, dst_nid) > group_weight(p, src_nid)) + return group_faults(p, dst_nid) > group_faults(p, src_nid); + } + + /* Encourage migration to the preferred node. 
*/ + if (dst_nid == p->numa_preferred_nid) return true; - return false; + return task_faults(p, dst_nid) > task_faults(p, src_nid); } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5165,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; + if (numa_group) { + /* Task is moving within/into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return false; + + /* Task is moving out of the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) < group_faults(p, src_nid); + } + /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) return true; - /* If either task or group weight get worse, don't do it. */ - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || - group_weight(p, dst_nid) < group_weight(p, src_nid)) - return true; - - return false; + return task_faults(p, dst_nid) < task_faults(p, src_nid); } #else -- cgit v1.2.3 From 096aa33863a5e48de52d2ff30e0801b7487944f4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 16 May 2014 00:13:32 -0400 Subject: sched/numa: Decay ->wakee_flips instead of zeroing Affine wakeups have the potential to interfere with NUMA placement. If a task wakes up too many other tasks, affine wakeups will get disabled. However, regardless of how many other tasks it wakes up, it gets re-enabled once a second, potentially interfering with NUMA placement of other tasks. By decaying wakee_wakes in half instead of zeroing it, we can avoid that problem for some workloads. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: chegu_vinod@hp.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20140516001332.67f91af2@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 503f750c2d25..c9617b73bcc0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4065,7 +4065,7 @@ static void record_wakee(struct task_struct *p) * about the loss. */ if (jiffies > current->wakee_flip_decay_ts + HZ) { - current->wakee_flips = 0; + current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } -- cgit v1.2.3 From e4c729664339e4be352d4c7434a5c6184285148d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 14 Apr 2014 15:38:11 -0600 Subject: resources: Clarify sanity check message The resource map sanity check message is a bit confusing. 
Change it to be more readable: -resource map sanity check conflict: 0xfed10000 0xfed15fff 0xfed10000 0xfed13fff pnp 00:01 +resource sanity check: requesting [mem 0xfed10000-0xfed15fff], which spans more than pnp 00:01 [mem 0xfed10000-0xfed13fff] Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 8957d686e29b..3c2237ac32db 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) if (p->flags & IORESOURCE_BUSY) continue; - printk(KERN_WARNING "resource map sanity check conflict: " - "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n", (unsigned long long)addr, (unsigned long long)(addr + size - 1), - (unsigned long long)p->start, - (unsigned long long)p->end, - p->name); + p->name, p); err = -1; break; } -- cgit v1.2.3 From 72e6ae285a1dbff553734985bedadf409d99c02d Mon Sep 17 00:00:00 2001 From: Victor Kamensky Date: Tue, 29 Apr 2014 04:20:52 +0100 Subject: ARM: 8043/1: uprobes need icache flush after xol write After an instruction is written into the xol area, on the ARMv7 architecture the code needs to flush the dcache and icache to sync them up for the given set of addresses. Having just the 'flush_dcache_page(page)' call is not enough - it is possible to have a stale instruction sitting in the icache for the given xol area slot address. Introduce the weak function arch_uprobe_copy_ixol() that by default calls the uprobes copy_to_page() function and then flush_dcache_page(), and on ARM define a new one that handles the xol slot copy in an ARM-specific way. The flush_uprobe_xol_access() function shares/reuses the implementation of flush_ptrace_access() and takes care of writing the instruction to the user land address space on the given variety of different cache types on ARM CPUs. Because flush_uprobe_xol_access() does not have a vma around, flush_ptrace_access() was split into two parts: a first part that retrieves the set of conditions from the vma and a common part that receives those conditions as flags. Note that the ARM cache flush function needs the kernel address through which the instruction write happened, so instead of using the uprobes copy_to_page() function the code was changed to explicitly map the page and do the memcpy. Note that the arch_uprobe_copy_ixol() function, in a similar way to copy_to_user_page(), has preempt_disable/preempt_enable. Signed-off-by: Victor Kamensky Acked-by: Oleg Nesterov Reviewed-by: David A. 
Long Signed-off-by: Russell King --- arch/arm/include/asm/cacheflush.h | 2 ++ arch/arm/kernel/uprobes.c | 20 ++++++++++++++++++++ arch/arm/mm/flush.c | 33 ++++++++++++++++++++++++++++----- include/linux/uprobes.h | 3 +++ kernel/events/uprobes.c | 25 +++++++++++++++++-------- 5 files changed, 70 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 00af9fe435e6..fd43f7f55b70 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -487,4 +487,6 @@ int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); +void flush_uprobe_xol_access(struct page *page, unsigned long uaddr, + void *kaddr, unsigned long len); #endif diff --git a/arch/arm/kernel/uprobes.c b/arch/arm/kernel/uprobes.c index f9bacee973bf..56adf9c1fde0 100644 --- a/arch/arm/kernel/uprobes.c +++ b/arch/arm/kernel/uprobes.c @@ -113,6 +113,26 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, return 0; } +void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + void *xol_page_kaddr = kmap_atomic(page); + void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + + preempt_disable(); + + /* Initialize the slot */ + memcpy(dst, src, len); + + /* flush caches (dcache/icache) */ + flush_uprobe_xol_access(page, vaddr, dst, len); + + preempt_enable(); + + kunmap_atomic(xol_page_kaddr); +} + + int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 3387e60e4ea3..43d54f5b26b9 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -104,17 +104,20 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsig #define flush_icache_alias(pfn,vaddr,len) do { } while (0) #endif +#define FLAG_PA_IS_EXEC 1 +#define FLAG_PA_CORE_IN_MM 2 + static void flush_ptrace_access_other(void *args) { __flush_icache_all(); } -static -void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, - unsigned long uaddr, void *kaddr, unsigned long len) +static inline +void __flush_ptrace_access(struct page *page, unsigned long uaddr, void *kaddr, + unsigned long len, unsigned int flags) { if (cache_is_vivt()) { - if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { + if (flags & FLAG_PA_CORE_IN_MM) { unsigned long addr = (unsigned long)kaddr; __cpuc_coherent_kern_range(addr, addr + len); } @@ -128,7 +131,7 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, } /* VIPT non-aliasing D-cache */ - if (vma->vm_flags & VM_EXEC) { + if (flags & FLAG_PA_IS_EXEC) { unsigned long addr = (unsigned long)kaddr; if (icache_is_vipt_aliasing()) flush_icache_alias(page_to_pfn(page), uaddr, len); @@ -140,6 +143,26 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, } } +static +void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, + unsigned long uaddr, void *kaddr, unsigned long len) +{ + unsigned int flags = 0; + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) + flags |= FLAG_PA_CORE_IN_MM; + if (vma->vm_flags & VM_EXEC) + flags |= FLAG_PA_IS_EXEC; + __flush_ptrace_access(page, uaddr, kaddr, len, flags); +} + +void flush_uprobe_xol_access(struct page *page, unsigned long uaddr, + void *kaddr, unsigned long len) +{ + unsigned int flags = 
FLAG_PA_CORE_IN_MM|FLAG_PA_IS_EXEC; + + __flush_ptrace_access(page, uaddr, kaddr, len, flags); +} + /* * Copy user data from/to a page which is mapped into a different * processes address space. Really, we want to allow our "user diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index edff2b97b864..c52f827ba6ce 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -32,6 +32,7 @@ struct vm_area_struct; struct mm_struct; struct inode; struct notifier_block; +struct page; #define UPROBE_HANDLER_REMOVE 1 #define UPROBE_HANDLER_MASK 1 @@ -127,6 +128,8 @@ extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned l extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); +extern void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04709b66369d..4968213c63fa 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1296,14 +1296,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - /* Initialize the slot */ - copy_to_page(area->page, xol_vaddr, - &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); - /* - * We probably need flush_icache_user_range() but it needs vma. - * This should work on supported architectures too. - */ - flush_dcache_page(area->page); + arch_uprobe_copy_ixol(area->page, xol_vaddr, + &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; } @@ -1346,6 +1340,21 @@ static void xol_free_insn_slot(struct task_struct *tsk) } } +void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + /* Initialize the slot */ + copy_to_page(page, vaddr, src, len); + + /* + * We probably need flush_icache_user_range() but it needs vma. + * This should work on most of architectures by default. If + * architecture needs to do something different it can define + * its own version of the function. + */ + flush_dcache_page(page); +} + /** * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs * @regs: Reflects the saved state of the task after it has hit a breakpoint -- cgit v1.2.3 From 27ddcc6596e50cb8f03d2e83248897667811d8f6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 26 May 2014 13:40:47 +0200 Subject: PM / sleep: Add state field to pm_states[] entries To allow sleep states corresponding to the "mem", "standby" and "freeze" lables to be different from the pm_states[] indexes of those strings, introduce struct pm_sleep_state, consisting of a string label and a state number, and turn pm_states[] into an array of objects of that type. This modification should not lead to any functional changes. Signed-off-by: Rafael J. 
Wysocki --- kernel/power/main.c | 16 ++++++++-------- kernel/power/power.h | 7 ++++++- kernel/power/suspend.c | 12 ++++++------ kernel/power/suspend_test.c | 22 ++++++++++------------ 4 files changed, 30 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 6271bc4073ef..8e818432253c 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -293,12 +293,12 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, { char *s = buf; #ifdef CONFIG_SUSPEND - int i; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (valid_state(i)) + s += sprintf(s,"%s ", pm_states[i].label); - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i] && valid_state(i)) - s += sprintf(s,"%s ", pm_states[i]); - } #endif #ifdef CONFIG_HIBERNATION s += sprintf(s, "%s\n", "disk"); @@ -314,7 +314,7 @@ static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state = PM_SUSPEND_MIN; - const char * const *s; + struct pm_sleep_state *s; #endif char *p; int len; @@ -328,7 +328,7 @@ static suspend_state_t decode_state(const char *buf, size_t n) #ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + if (len == strlen(s->label) && !strncmp(buf, s->label, len)) return state; #endif @@ -448,7 +448,7 @@ static ssize_t autosleep_show(struct kobject *kobj, #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) return sprintf(buf, "%s\n", valid_state(state) ? - pm_states[state] : "error"); + pm_states[state].label : "error"); #endif #ifdef CONFIG_HIBERNATION return sprintf(buf, "disk\n"); diff --git a/kernel/power/power.h b/kernel/power/power.h index 15f37ea08719..99539c5da844 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -178,8 +178,13 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); #ifdef CONFIG_SUSPEND +struct pm_sleep_state { + const char *label; + suspend_state_t state; +}; + /* kernel/power/suspend.c */ -extern const char *const pm_states[]; +extern struct pm_sleep_state pm_states[]; extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 155721f7f909..5d93b138a2d4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,10 +31,10 @@ #include "power.h" -const char *const pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_FREEZE] = "freeze", - [PM_SUSPEND_STANDBY] = "standby", - [PM_SUSPEND_MEM] = "mem", +struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_FREEZE] = { "freeze", PM_SUSPEND_FREEZE }, + [PM_SUSPEND_STANDBY] = { "standby", PM_SUSPEND_STANDBY }, + [PM_SUSPEND_MEM] = { "mem", PM_SUSPEND_MEM }, }; static const struct platform_suspend_ops *suspend_ops; @@ -343,7 +343,7 @@ static int enter_state(suspend_state_t state) sys_sync(); printk("done.\n"); - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); error = suspend_prepare(state); if (error) goto Unlock; @@ -351,7 +351,7 @@ static int enter_state(suspend_state_t state) if (suspend_test(TEST_FREEZER)) goto Finish; - pr_debug("PM: Entering %s sleep\n", pm_states[state]); + pr_debug("PM: Entering %s sleep\n", pm_states[state].label); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); diff --git 
a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 9b2a1d58558d..d4e3ab167a73 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state]); + printk(info_test, pm_states[state].label); status = pm_suspend(state); if (status == -ENODEV) state = PM_SUSPEND_STANDBY; } if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state]); + printk(info_test, pm_states[state].label); status = pm_suspend(state); } if (status < 0) @@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata = static int __init setup_test_suspend(char *value) { - unsigned i; + suspend_state_t i; /* "=mem" ==> "mem" */ value++; - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (!pm_states[i]) - continue; - if (strcmp(pm_states[i], value) != 0) - continue; - test_state = (__force suspend_state_t) i; - return 0; - } + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (!strcmp(pm_states[i].label, value)) { + test_state = pm_states[i].state; + return 0; + } + printk(warn_bad_state, value); return 0; } @@ -165,7 +163,7 @@ static int __init test_suspend(void) if (test_state == PM_SUSPEND_ON) goto done; if (!valid_state(test_state)) { - printk(warn_bad_state, pm_states[test_state]); + printk(warn_bad_state, pm_states[test_state].label); goto done; } -- cgit v1.2.3 From 43e8317b0bba1d6eb85f38a4a233d82d7c20d732 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 26 May 2014 13:40:53 +0200 Subject: PM / sleep: Use valid_state() for platform-dependent sleep states only Use the observation that, for platform-dependent sleep states (PM_SUSPEND_STANDBY, PM_SUSPEND_MEM), a given state is either always supported or always unsupported and store that information in pm_states[] instead of calling valid_state() every time we need to check it. Also do not use valid_state() for PM_SUSPEND_FREEZE, which is always valid, and move the pm_test_level validity check for PM_SUSPEND_FREEZE directly into enter_state(). Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 9 ++++--- kernel/power/power.h | 2 -- kernel/power/suspend.c | 60 ++++++++++++++++++++++----------------------- kernel/power/suspend_test.c | 2 +- 4 files changed, 36 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 8e818432253c..9f51f0ab3d86 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -296,7 +296,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (valid_state(i)) + if (pm_states[i].state) s += sprintf(s,"%s ", pm_states[i].label); #endif @@ -328,8 +328,9 @@ static suspend_state_t decode_state(const char *buf, size_t n) #ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) - if (len == strlen(s->label) && !strncmp(buf, s->label, len)) - return state; + if (s->state && len == strlen(s->label) + && !strncmp(buf, s->label, len)) + return s->state; #endif return PM_SUSPEND_ON; @@ -447,7 +448,7 @@ static ssize_t autosleep_show(struct kobject *kobj, #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", valid_state(state) ? + return sprintf(buf, "%s\n", pm_states[state].state ? 
pm_states[state].label : "error"); #endif #ifdef CONFIG_HIBERNATION diff --git a/kernel/power/power.h b/kernel/power/power.h index 99539c5da844..c60f13b5270a 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -186,14 +186,12 @@ struct pm_sleep_state { /* kernel/power/suspend.c */ extern struct pm_sleep_state pm_states[]; -extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } -static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ #ifdef CONFIG_PM_TEST_SUSPEND diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5d93b138a2d4..00aca60904b0 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -32,9 +32,9 @@ #include "power.h" struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_FREEZE] = { "freeze", PM_SUSPEND_FREEZE }, - [PM_SUSPEND_STANDBY] = { "standby", PM_SUSPEND_STANDBY }, - [PM_SUSPEND_MEM] = { "mem", PM_SUSPEND_MEM }, + [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, + [PM_SUSPEND_STANDBY] = { .label = "standby", }, + [PM_SUSPEND_MEM] = { .label = "mem", }, }; static const struct platform_suspend_ops *suspend_ops; @@ -68,42 +68,34 @@ void freeze_wake(void) } EXPORT_SYMBOL_GPL(freeze_wake); +static bool valid_state(suspend_state_t state) +{ + /* + * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level + * support and need to be valid to the low level + * implementation, no valid callback implies that none are valid. + */ + return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); +} + /** * suspend_set_ops - Set the global suspend method table. * @ops: Suspend operations to use. */ void suspend_set_ops(const struct platform_suspend_ops *ops) { + suspend_state_t i; + lock_system_sleep(); + suspend_ops = ops; + for (i = PM_SUSPEND_STANDBY; i <= PM_SUSPEND_MEM; i++) + pm_states[i].state = valid_state(i) ? i : 0; + unlock_system_sleep(); } EXPORT_SYMBOL_GPL(suspend_set_ops); -bool valid_state(suspend_state_t state) -{ - if (state == PM_SUSPEND_FREEZE) { -#ifdef CONFIG_PM_DEBUG - if (pm_test_level != TEST_NONE && - pm_test_level != TEST_FREEZER && - pm_test_level != TEST_DEVICES && - pm_test_level != TEST_PLATFORM) { - printk(KERN_WARNING "Unsupported pm_test mode for " - "freeze state, please choose " - "none/freezer/devices/platform.\n"); - return false; - } -#endif - return true; - } - /* - * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel - * support and need to be valid to the lowlevel - * implementation, no valid callback implies that none are valid. - */ - return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); -} - /** * suspend_valid_only_mem - Generic memory-only valid callback. 
* @@ -330,9 +322,17 @@ static int enter_state(suspend_state_t state) { int error; - if (!valid_state(state)) - return -ENODEV; - + if (state == PM_SUSPEND_FREEZE) { +#ifdef CONFIG_PM_DEBUG + if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { + pr_warning("PM: Unsupported test mode for freeze state," + "please choose none/freezer/devices/platform.\n"); + return -EAGAIN; + } +#endif + } else if (!valid_state(state)) { + return -EINVAL; + } if (!mutex_trylock(&pm_mutex)) return -EBUSY; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index d4e3ab167a73..269b097e78ea 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -162,7 +162,7 @@ static int __init test_suspend(void) /* PM is initialized by now; is that state testable? */ if (test_state == PM_SUSPEND_ON) goto done; - if (!valid_state(test_state)) { + if (!pm_states[test_state].state) { printk(warn_bad_state, pm_states[test_state].label); goto done; } -- cgit v1.2.3 From 0399d4db3edf5c58b6ec7f672f089f5085e49ed5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 26 May 2014 13:40:59 +0200 Subject: PM / sleep: Introduce command line argument for sleep state enumeration On some systems the platform supports neither PM_SUSPEND_MEM nor PM_SUSPEND_STANDBY, so PM_SUSPEND_FREEZE is the only available system sleep state. However, some user space frameworks only use the "mem" and (sometimes) "standby" sleep state labels, so the users of those systems need to modify user space in order to be able to use system suspend at all and that is not always possible. For this reason, add a new kernel command line argument, relative_sleep_states, allowing the users of those systems to change the way in which the kernel assigns labels to system sleep states. Namely, for relative_sleep_states=1, the "mem", "standby" and "freeze" labels will enumerate the available system sleep states from the deepest to the shallowest, respectively, so that "mem" is always present in /sys/power/state and the other state strings may or may not be present depending on what is supported by the platform. Update system sleep states documentation to reflect this change. Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 29 ++++++++---- Documentation/kernel-parameters.txt | 7 +++ Documentation/power/states.txt | 87 ++++++++++++++++++++++------------- kernel/power/main.c | 12 ++--- kernel/power/suspend.c | 32 ++++++++++++- 5 files changed, 119 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 64c9276e9421..f4551816329e 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -7,19 +7,30 @@ Description: subsystem. What: /sys/power/state -Date: August 2006 +Date: May 2014 Contact: Rafael J. Wysocki Description: - The /sys/power/state file controls the system power state. - Reading from this file returns what states are supported, - which is hard-coded to 'freeze' (Low-Power Idle), 'standby' - (Power-On Suspend), 'mem' (Suspend-to-RAM), and 'disk' - (Suspend-to-Disk). + The /sys/power/state file controls system sleep states. + Reading from this file returns the available sleep state + labels, which may be "mem", "standby", "freeze" and "disk" + (hibernation).
The meanings of the first three labels depend on + the relative_sleep_states command line argument as follows: + 1) relative_sleep_states = 1 + "mem", "standby", "freeze" represent non-hibernation sleep + states from the deepest ("mem", always present) to the + shallowest ("freeze"). "standby" and "freeze" may or may + not be present depending on the capabilities of the + platform. "freeze" can only be present if "standby" is + present. + 2) relative_sleep_states = 0 (default) + "mem" - "suspend-to-RAM", present if supported. + "standby" - "power-on suspend", present if supported. + "freeze" - "suspend-to-idle", always present. Writing to this file one of these strings causes the system to - transition into that state. Please see the file - Documentation/power/states.txt for a description of each of - these states. + transition into the corresponding state, if available. See + Documentation/power/states.txt for a description of what + "suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean. What: /sys/power/disk Date: September 2006 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 43842177b771..e19a88b63eeb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2889,6 +2889,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/cgroups/cpusets.txt. + relative_sleep_states= + [SUSPEND] Use sleep state labeling where the deepest + state available other than hibernation is always "mem". + Format: { "0" | "1" } + 0 -- Traditional sleep state labels. + 1 -- Relative sleep state labels. + reserve= [KNL,BUGS] Force the kernel to ignore some iomem area reservetop= [X86-32] diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 442d43df9b25..50f3ef9177c1 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -1,62 +1,87 @@ +System Power Management Sleep States -System Power Management States +(C) 2014 Intel Corp., Rafael J. Wysocki +The kernel supports up to four system sleep states generically, although three +of them depend on the platform support code to implement the low-level details +for each state. -The kernel supports four power management states generically, though -one is generic and the other three are dependent on platform support -code to implement the low-level details for each state. -This file describes each state, what they are -commonly called, what ACPI state they map to, and what string to write -to /sys/power/state to enter that state +The states are represented by strings that can be read or written to the +/sys/power/state file. Those strings may be "mem", "standby", "freeze" and +"disk", where the last one always represents hibernation (Suspend-To-Disk) and +the meaning of the remaining ones depends on the relative_sleep_states command +line argument. -state: Freeze / Low-Power Idle +For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the +available non-hibernation sleep states from the deepest to the shallowest, +respectively. In that case, "mem" is always present in /sys/power/state, +because there is at least one non-hibernation sleep state in every system. If +the given system supports two non-hibernation sleep states, "standby" is present +in /sys/power/state in addition to "mem". 
If the system supports three +non-hibernation sleep states, "freeze" will be present in /sys/power/state in +addition to "mem" and "standby". + +For relative_sleep_states=0, which is the default, the following descriptions +apply. + +state: Suspend-To-Idle ACPI state: S0 -String: "freeze" +Label: "freeze" -This state is a generic, pure software, light-weight, low-power state. -It allows more energy to be saved relative to idle by freezing user +This state is a generic, pure software, light-weight, system sleep state. +It allows more energy to be saved relative to runtime idle by freezing user space and putting all I/O devices into low-power states (possibly lower-power than available at run time), such that the processors can spend more time in their idle states. -This state can be used for platforms without Standby/Suspend-to-RAM + +This state can be used for platforms without Power-On Suspend/Suspend-to-RAM support, or it can be used in addition to Suspend-to-RAM (memory sleep) -to provide reduced resume latency. +to provide reduced resume latency. It is always supported. State: Standby / Power-On Suspend ACPI State: S1 -String: "standby" +Label: "standby" -This state offers minimal, though real, power savings, while providing -a very low-latency transition back to a working system. No operating -state is lost (the CPU retains power), so the system easily starts up +This state, if supported, offers moderate, though real, power savings, while +providing a relatively low-latency transition back to a working system. No +operating state is lost (the CPU retains power), so the system easily starts up again where it left off. -We try to put devices in a low-power state equivalent to D1, which -also offers low power savings, but low resume latency. Not all devices -support D1, and those that don't are left on. +In addition to freezing user space and putting all I/O devices into low-power +states, which is done for Suspend-To-Idle too, nonboot CPUs are taken offline +and all low-level system functions are suspended during transitions into this +state. For this reason, it should allow more energy to be saved relative to +Suspend-To-Idle, but the resume latency will generally be greater than for that +state. State: Suspend-to-RAM ACPI State: S3 -String: "mem" +Label: "mem" -This state offers significant power savings as everything in the -system is put into a low-power state, except for memory, which is -placed in self-refresh mode to retain its contents. +This state, if supported, offers significant power savings as everything in the +system is put into a low-power state, except for memory, which should be placed +into the self-refresh mode to retain its contents. All of the steps carried out +when entering Power-On Suspend are also carried out during transitions to STR. +Additional operations may take place depending on the platform capabilities. In +particular, on ACPI systems the kernel passes control to the BIOS (platform +firmware) as the last step during STR transitions and that usually results in +powering down some more low-level components that aren't directly controlled by +the kernel. -System and device state is saved and kept in memory. All devices are -suspended and put into D3. In many cases, all peripheral buses lose -power when entering STR, so devices must be able to handle the -transition back to the On state. +System and device state is saved and kept in memory. All devices are suspended +and put into low-power states. 
In many cases, all peripheral buses lose power +when entering STR, so devices must be able to handle the transition back to the +"on" state. -For at least ACPI, STR requires some minimal boot-strapping code to -resume the system from STR. This may be true on other platforms. +For at least ACPI, STR requires some minimal boot-strapping code to resume the +system from it. This may be the case on other platforms too. State: Suspend-to-disk ACPI State: S4 -String: "disk" +Label: "disk" This state offers the greatest power savings, and can be used even in the absence of low-level platform support for power management. This diff --git a/kernel/power/main.c b/kernel/power/main.c index 9f51f0ab3d86..573410d6647e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -279,14 +279,14 @@ static inline void pm_print_times_init(void) {} struct kobject *power_kobj; /** - * state - control system power state. + * state - control system sleep states. * - * show() returns what states are supported, which is hard-coded to - * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), - * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). + * show() returns available sleep state labels, which may be "mem", "standby", + * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a + * description of what they mean. * - * store() accepts one of those strings, translates it into the - * proper enumerated value, and initiates a suspend transition. + * store() accepts one of those strings, translates it into the proper + * enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 00aca60904b0..338a6f147974 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -78,6 +78,26 @@ static bool valid_state(suspend_state_t state) return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); } +/* + * If this is set, the "mem" label always corresponds to the deepest sleep state + * available, the "standby" label corresponds to the second deepest sleep state + * available (if any), and the "freeze" label corresponds to the remaining + * available sleep state (if there is one). + */ +static bool relative_states; + +static int __init sleep_states_setup(char *str) +{ + relative_states = !strncmp(str, "1", 1); + if (relative_states) { + pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; + pm_states[PM_SUSPEND_FREEZE].state = 0; + } + return 1; +} + +__setup("relative_sleep_states=", sleep_states_setup); + /** * suspend_set_ops - Set the global suspend method table. * @ops: Suspend operations to use. @@ -85,12 +105,20 @@ static bool valid_state(suspend_state_t state) void suspend_set_ops(const struct platform_suspend_ops *ops) { suspend_state_t i; + int j = PM_SUSPEND_MAX - 1; lock_system_sleep(); suspend_ops = ops; - for (i = PM_SUSPEND_STANDBY; i <= PM_SUSPEND_MEM; i++) - pm_states[i].state = valid_state(i) ? 
i : 0; + for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) + if (valid_state(i)) + pm_states[j--].state = i; + else if (!relative_states) + pm_states[j--].state = 0; + + pm_states[j--].state = PM_SUSPEND_FREEZE; + while (j >= PM_SUSPEND_MIN) + pm_states[j--].state = 0; unlock_system_sleep(); } -- cgit v1.2.3 From a257954bb38ab23dbf93df812b4b2c01cae29d8b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 27 May 2014 16:07:37 +0800 Subject: genirq: Improve documentation to match current implementation Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Joerg Roedel Cc: Paul Gortmaker Cc: Greg Kroah-Hartman Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Randy Dunlap Cc: Yinghai Lu Cc: Jiri Kosina Link: http://lkml.kernel.org/r/1401178092-1228-3-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- Documentation/IRQ-domain.txt | 3 +-- kernel/irq/internals.h | 2 +- kernel/irq/irqdomain.c | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/IRQ-domain.txt b/Documentation/IRQ-domain.txt index 03df71aeb38c..8a8b82c9ca53 100644 --- a/Documentation/IRQ-domain.txt +++ b/Documentation/IRQ-domain.txt @@ -41,8 +41,7 @@ An interrupt controller driver creates and registers an irq_domain by calling one of the irq_domain_add_*() functions (each mapping method has a different allocator function, more on that later). The function will return a pointer to the irq_domain on success. The caller must -provide the allocator function with an irq_domain_ops structure with -the .map callback populated as a minimum. +provide the allocator function with an irq_domain_ops structure. In most cases, the irq_domain will begin empty without any mappings between hwirq and IRQ numbers. Mappings are added to the irq_domain diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c4065e3edbc3..099ea2e0eb88 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -33,7 +33,7 @@ enum { }; /* - * Bit masks for desc->state + * Bit masks for desc->core_internal_state__do_not_mess_with_it * * IRQS_AUTODETECT - autodetection in progress * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f14033700c25..eb5e10e32e05 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain; * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller * @size: Size of linear map; 0 for radix mapping only + * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no * direct mapping * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * - * Allocates and initialize and irq_domain structure. Caller is expected to - * register allocated irq_domain with irq_domain_register(). Returns pointer - * to IRQ domain, or NULL on failure. + * Allocates and initialize and irq_domain structure. + * Returns pointer to IRQ domain, or NULL on failure. 
*/ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, irq_hw_number_t hwirq_max, int direct_max, -- cgit v1.2.3 From 26fc9cd200ec839e0b3095e05ae018f27314e7aa Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Sat, 26 Apr 2014 15:40:28 +0800 Subject: kernfs: move the last knowledge of sysfs out from kernfs There is still one residue of sysfs remaining: the sb_magic SYSFS_MAGIC. However, this should be kernfs user specific, so this patch moves it out. Kernfs users should specify their magic number while mounting. Signed-off-by: Jianyu Zhan Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/mount.c | 11 ++++++----- fs/sysfs/mount.c | 4 +++- include/linux/kernfs.h | 13 ++++++++----- kernel/cgroup.c | 4 +++- 4 files changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f25a7c0c3cdc..d171b98a6cdd 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -62,7 +62,7 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) return NULL; } -static int kernfs_fill_super(struct super_block *sb) +static int kernfs_fill_super(struct super_block *sb, unsigned long magic) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; @@ -71,7 +71,7 @@ static int kernfs_fill_super(struct super_block *sb) info->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; + sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_time_gran = 1; @@ -132,6 +132,7 @@ const void *kernfs_super_ns(struct super_block *sb) * @fs_type: file_system_type of the fs being mounted * @flags: mount flags specified for the mount * @root: kernfs_root of the hierarchy being mounted + * @magic: file system specific magic number * @new_sb_created: tell the caller if we allocated a new superblock * @ns: optional namespace tag of the mount * @@ -143,8 +144,8 @@ const void *kernfs_super_ns(struct super_block *sb) * The return value can be passed to the vfs layer verbatim.
*/ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, - const void *ns) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns) { struct super_block *sb; struct kernfs_super_info *info; @@ -169,7 +170,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); - error = kernfs_fill_super(sb); + error = kernfs_fill_super(sb, magic); if (error) { deactivate_locked_super(sb); return ERR_PTR(error); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8794423f7efb..8a49486bf30c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -13,6 +13,7 @@ #define DEBUG #include +#include #include #include #include @@ -38,7 +39,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, } ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, + SYSFS_MAGIC, &new_sb, ns); if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); return root; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index c841688a78a3..17aa1cce6f8e 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -301,8 +301,8 @@ void kernfs_notify(struct kernfs_node *kn); const void *kernfs_super_ns(struct super_block *sb); struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, - const void *ns); + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns); void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); @@ -395,7 +395,8 @@ static inline const void *kernfs_super_ns(struct super_block *sb) static inline struct dentry * kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, const void *ns) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns) { return ERR_PTR(-ENOSYS); } static inline void kernfs_kill_sb(struct super_block *sb) { } @@ -453,9 +454,11 @@ static inline int kernfs_rename(struct kernfs_node *kn, static inline struct dentry * kernfs_mount(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created) { - return kernfs_mount_ns(fs_type, flags, root, new_sb_created, NULL); + return kernfs_mount_ns(fs_type, flags, root, + magic, new_sb_created, NULL); } #endif /* __LINUX_KERNFS_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f1ca934a237..ceee0c54c6a4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1604,7 +1605,8 @@ out_unlock: if (ret) return ERR_PTR(ret); - dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); + dentry = kernfs_mount(fs_type, flags, root->kf_root, + CGROUP_SUPER_MAGIC, &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); return dentry; -- cgit v1.2.3 From 7fa21dd8bd191564a195291161d6b43db5d9c350 Mon Sep 17 00:00:00 2001 From: Stephen Chivers Date: Wed, 14 May 2014 08:04:39 +1000 Subject: printk/of_serial: fix serial console cessation part way through boot. 
Commit 5f5c9ae56c38942623f69c3e6dc6ec78e4da2076 "serial_core: Unregister console in uart_remove_one_port()" fixed a crash where a serial port was removed but not deregistered as a console. There is a side effect of that commit for platforms having serial consoles and of_serial configured (CONFIG_SERIAL_OF_PLATFORM). The serial console is disabled midway through the boot process. This cessation of the serial console affects PowerPC computers such as the MVME5100 and SAM440EP. The sequence is: bootconsole [udbg0] enabled .... serial8250/16550 driver initialises and registers its UARTS, one of these is the serial console. console [ttyS0] enabled .... of_serial probes "platform" devices, registering them as it goes. One of these is the serial console. console [ttyS0] disabled. The disabling of the serial console is due to: a. unregister_console in printk not clearing the CONS_ENABLED bit in the console flags, even though it has announced that the console is disabled; and b. of_platform_serial_probe in of_serial not setting the port type before it registers with serial8250_register_8250_port. This patch ensures that the serial console is re-enabled when of_serial registers a serial port that corresponds to the designated console. Signed-off-by: Stephen Chivers Tested-by: Stephen Chivers Acked-by: Geert Uytterhoeven [unregister_console] Cc: stable # 3.15 === The above failure was identified in Linux-3.15-rc2. Tested using MVME5100 and SAM440EP PowerPC computers with kernels built from Linux-3.15-rc5 and tty-next. The continued operation of the serial console is vital for computers such as the MVME5100 as that Single Board Computer does not have any grapical/display hardware. Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/of_serial.c | 1 + kernel/printk/printk.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/drivers/tty/serial/of_serial.c b/drivers/tty/serial/of_serial.c index 77ec6a147522..68d4455f3cf9 100644 --- a/drivers/tty/serial/of_serial.c +++ b/drivers/tty/serial/of_serial.c @@ -173,6 +173,7 @@ static int of_platform_serial_probe(struct platform_device *ofdev) { struct uart_8250_port port8250; memset(&port8250, 0, sizeof(port8250)); + port.type = port_type; port8250.port = port; if (port.fifosize) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a45b50962295..81c71617ea28 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2413,6 +2413,7 @@ int unregister_console(struct console *console) if (console_drivers != NULL && console->flags & CON_CONSDEV) console_drivers->flags |= CON_CONSDEV; + console->flags &= ~CON_ENABLED; console_unlock(); console_sysfs_notify(); return res; -- cgit v1.2.3 From 057b0a7518e4b8fca26201715996d6d928a62300 Mon Sep 17 00:00:00 2001 From: Niv Yehezkel Date: Sat, 31 May 2014 06:26:01 -0400 Subject: PM / hibernate: fixed typo in comment Fix a trivial comment typo (s/mam/map) in kernel/power/swap.c. Signed-off-by: Niv Yehezkel Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8c9a4819f798..aaa3261dea5d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data) /** * save_image_lzo - Save the suspend image data compressed with LZO. - * @handle: Swap mam handle to use for saving the image. + * @handle: Swap map handle to use for saving the image. * @snapshot: Image to read data from. 
* @nr_to_write: Number of pages to save. */ -- cgit v1.2.3 From 8fe6929cfd43c44834858a53e129ffdc7c166298 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 4 Jun 2014 16:05:36 -0700 Subject: kthread: fix return value of kthread_create() upon SIGKILL. Commit 786235eeba0e ("kthread: make kthread_create() killable") meant for allowing kthread_create() to abort as soon as killed by the OOM-killer. But returning -ENOMEM is wrong if killed by SIGKILL from userspace. Change kthread_create() to return -EINTR upon SIGKILL. Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Acked-by: David Rientjes Cc: [3.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9a130ec06f7a..c2390f41307b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create) * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * - * Returns a task_struct or ERR_PTR(-ENOMEM). + * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, @@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), * that thread. */ if (xchg(&create->done, NULL)) - return ERR_PTR(-ENOMEM); + return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. -- cgit v1.2.3 From 52383431b37cdbec63944e953ffc2698a7ad9722 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:39 -0700 Subject: mm: get rid of __GFP_KMEMCG Currently to allocate a page that should be charged to kmemcg (e.g. threadinfo), we pass __GFP_KMEMCG flag to the page allocator. The page allocated is then to be freed by free_memcg_kmem_pages. Apart from looking asymmetrical, this also requires intrusion to the general allocation path. So let's introduce separate functions that will alloc/free pages charged to kmemcg. The new functions are called alloc_kmem_pages and free_kmem_pages. They should be used when the caller actually would like to use kmalloc, but has to fall back to the page allocator for the allocation is large. They only differ from alloc_pages and free_pages in that besides allocating or freeing pages they also charge them to the kmem resource counter of the current memory cgroup. 
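As a rough illustration (a hypothetical caller sketch, not part of the patch; the buf_alloc/buf_free wrappers are invented and the conversion mirrors the kernel/fork.c hunk further down), a user that previously passed __GFP_KMEMCG to the page allocator would now do:

	#include <linux/gfp.h>
	#include <linux/mm.h>

	static void *buf_alloc(int node, unsigned int order)
	{
		/* was: alloc_pages_node(node, GFP_KERNEL | __GFP_KMEMCG, order) */
		struct page *page = alloc_kmem_pages_node(node, GFP_KERNEL, order);

		return page ? page_address(page) : NULL;
	}

	static void buf_free(void *buf, unsigned int order)
	{
		/* was: free_memcg_kmem_pages((unsigned long)buf, order) */
		free_kmem_pages((unsigned long)buf, order);
	}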
[sfr@canb.auug.org.au: export kmalloc_order() to modules] Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 +++++--- include/linux/memcontrol.h | 2 +- include/linux/slab.h | 11 +------- include/linux/thread_info.h | 2 -- include/trace/events/gfpflags.h | 1 - kernel/fork.c | 6 ++--- mm/memcontrol.c | 11 ++++---- mm/page_alloc.c | 56 +++++++++++++++++++++++++---------------- mm/slab_common.c | 13 ++++++++++ mm/slub.c | 6 ++--- 10 files changed, 68 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 39b81dc7d01a..d382db71e300 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -31,7 +31,6 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u -#define ___GFP_KMEMCG 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -91,7 +90,6 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ -#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* @@ -353,6 +351,10 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ alloc_pages_vma(gfp_mask, 0, vma, addr, node) +extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); +extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, + unsigned int order); + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); @@ -372,8 +374,8 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); extern void free_hot_cold_page_list(struct list_head *list, int cold); -extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); -extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); +extern void __free_kmem_pages(struct page *page, unsigned int order); +extern void free_kmem_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 96e5d2573eb0..5155d09e749d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -537,7 +537,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) * res_counter_charge_nofail, but we hope those allocations are rare, * and won't be worth the trouble. 
*/ - if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) + if (gfp & __GFP_NOFAIL) return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; diff --git a/include/linux/slab.h b/include/linux/slab.h index 307bfbe62387..a6aab2c0dfc5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -369,16 +369,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, #include #endif -static __always_inline void * -kmalloc_order(size_t size, gfp_t flags, unsigned int order) -{ - void *ret; - - flags |= (__GFP_COMP | __GFP_KMEMCG); - ret = (void *) __get_free_pages(flags, order); - kmemleak_alloc(ret, size, 1, flags); - return ret; -} +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); #ifdef CONFIG_TRACING extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index cb0cec94fda3..ff307b548ed3 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -61,8 +61,6 @@ extern long do_no_restart_syscall(struct restart_block *parm); # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) #endif -#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) - /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 1eddbf1557f2..d6fd8e5b14b7 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -34,7 +34,6 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ - {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26f612f..59e3dcc5b8f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, - THREAD_SIZE_ORDER); + struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 56a768b3d5a8..7bab1de50f48 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3540,11 +3540,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. 
We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c0..7cfdcd808f52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2697,7 +2697,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2716,13 +2715,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2782,8 +2774,6 @@ out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2837,27 +2827,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. - * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. + */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. 
*/ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } diff --git a/mm/slab_common.c b/mm/slab_common.c index 06f0c6125632..1950c8f4d1a6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -582,6 +582,19 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) +{ + void *ret; + struct page *page; + + flags |= __GFP_COMP; + page = alloc_kmem_pages(flags, order); + ret = page ? page_address(page) : NULL; + kmemleak_alloc(ret, size, 1, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order); + #ifdef CONFIG_TRACING void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { diff --git a/mm/slub.c b/mm/slub.c index fc9831851be6..ddb60795f373 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3311,8 +3311,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3381,7 +3381,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_memcg_kmem_pages(page, compound_order(page)); + __free_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); -- cgit v1.2.3 From f98bafa06a28fdfdd5c49f820f4d6560f636fc46 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:34 -0700 Subject: memcg: kill CONFIG_MM_OWNER CONFIG_MM_OWNER makes no sense. It is not user-selectable, it is only selected by CONFIG_MEMCG automatically. So we can kill this option in init/Kconfig and do s/CONFIG_MM_OWNER/CONFIG_MEMCG/ globally. Signed-off-by: Oleg Nesterov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- include/linux/sched.h | 4 ++-- init/Kconfig | 7 ------- kernel/exit.c | 4 ++-- kernel/fork.c | 4 ++-- 5 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8967e20cbe57..de1627232af0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -406,7 +406,7 @@ struct mm_struct { spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; #endif -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. 
All of the following must be true in diff --git a/include/linux/sched.h b/include/linux/sched.h index 70f67e4e6156..2f2dd7d932a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2967,7 +2967,7 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else @@ -2978,7 +2978,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm) static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ static inline unsigned long task_rlimit(const struct task_struct *tsk, unsigned int limit) diff --git a/init/Kconfig b/init/Kconfig index 4a1822a1a680..0a2f09a80e90 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -933,7 +933,6 @@ config RESOURCE_COUNTERS config MEMCG bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS - select MM_OWNER select EVENTFD help Provides a memory resource controller that manages both anonymous @@ -951,9 +950,6 @@ config MEMCG disable memory resource controller and you can avoid overheads. (and lose benefits of memory resource controller) - This config option also selects MM_OWNER config option, which - could in turn add some fork/exit overhead. - config MEMCG_SWAP bool "Memory Resource Controller Swap Extension" depends on MEMCG && SWAP @@ -1179,9 +1175,6 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. -config MM_OWNER - bool - config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/exit.c b/kernel/exit.c index 6ed6a1d552b5..da1b838de8a6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -352,7 +352,7 @@ int disallow_signal(int sig) EXPORT_SYMBOL(disallow_signal); -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -434,7 +434,7 @@ assign_new_owner: task_unlock(c); put_task_struct(c); } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we diff --git a/kernel/fork.c b/kernel/fork.c index 59e3dcc5b8f2..0d53eb0dfb6f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { mm->owner = p; } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Initialize POSIX timer handling for a single task. -- cgit v1.2.3 From f87fb599ae4d2a152a93f9821b94f3158146d097 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:52 -0700 Subject: memcg: mm_update_next_owner() should skip kthreads "Search through everything else" in mm_update_next_owner() can hit a kthread which adopted this "mm" via use_mm(), it should not be used as mm->owner. Add the PF_KTHREAD check. While at it, change this code to use for_each_process_thread() instead of deprecated do_each_thread/while_each_thread. 
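The resulting loop, condensed from the kernel/exit.c hunk below (g and c are the process and thread cursors, mm is the exiting task's mm):

	for_each_process_thread(g, c) {
		/* a kthread may have adopted this mm via use_mm(); never make it the owner */
		if (!(c->flags & PF_KTHREAD) && c->mm == mm)
			goto assign_new_owner;
	}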
Signed-off-by: Oleg Nesterov Reviewed-by: Michal Hocko Cc: Balbir Singh Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Peter Chiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index da1b838de8a6..5ac3c19c245c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -395,14 +395,12 @@ retry: } /* - * Search through everything else. We should not get - * here often + * Search through everything else, we should not get here often. */ - do_each_thread(g, c) { - if (c->mm == mm) + for_each_process_thread(g, c) { + if (!(c->flags & PF_KTHREAD) && c->mm == mm) goto assign_new_owner; - } while_each_thread(g, c); - + } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are -- cgit v1.2.3 From 39af1765f1255b2bbadc3064e16270781abf24a1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:54 -0700 Subject: memcg: optimize the "Search everything else" loop in mm_update_next_owner() for_each_process_thread() is sub-optimal. All threads share the same ->mm, we can swicth to the next process once we found a thread with ->mm != NULL and ->mm != mm. Signed-off-by: Oleg Nesterov Reviewed-by: Michal Hocko Cc: Balbir Singh Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Peter Chiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 5ac3c19c245c..750c2e594617 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -397,9 +397,15 @@ retry: /* * Search through everything else, we should not get here often. */ - for_each_process_thread(g, c) { - if (!(c->flags & PF_KTHREAD) && c->mm == mm) - goto assign_new_owner; + for_each_process(g) { + if (g->flags & PF_KTHREAD) + continue; + for_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + if (c->mm) + break; + } } read_unlock(&tasklist_lock); /* -- cgit v1.2.3 From 664eeddeef6539247691197c1ac124d4aa872ab6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:08 -0700 Subject: mm: page_alloc: use jump labels to avoid checking number_of_cpusets If cpusets are not in use then we still check a global variable on every page allocation. Use jump labels to avoid the overhead. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 29 ++++++++++++++++++++++++++--- kernel/cpuset.c | 14 ++++---------- mm/page_alloc.c | 3 ++- 3 files changed, 32 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index b19d3dc2e651..ade2390ffe92 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -12,10 +12,31 @@ #include #include #include +#include #ifdef CONFIG_CPUSETS -extern int number_of_cpusets; /* How many cpusets are defined in system? 
*/ +extern struct static_key cpusets_enabled_key; +static inline bool cpusets_enabled(void) +{ + return static_key_false(&cpusets_enabled_key); +} + +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key) + 1; +} + +static inline void cpuset_inc(void) +{ + static_key_slow_inc(&cpusets_enabled_key); +} + +static inline void cpuset_dec(void) +{ + static_key_slow_dec(&cpusets_enabled_key); +} extern int cpuset_init(void); extern void cpuset_init_smp(void); @@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - return number_of_cpusets <= 1 || + return nr_cpusets() <= 1 || __cpuset_node_allowed_softwall(node, gfp_mask); } static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { - return number_of_cpusets <= 1 || + return nr_cpusets() <= 1 || __cpuset_node_allowed_hardwall(node, gfp_mask); } @@ -124,6 +145,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) #else /* !CONFIG_CPUSETS */ +static inline bool cpusets_enabled(void) { return false; } + static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d54c418bd06..130017843899 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,12 +61,7 @@ #include #include -/* - * Tracks how many cpusets are currently defined in system. - * When there is only one cpuset (the root cpuset) we can - * short circuit some hooks. - */ -int number_of_cpusets __read_mostly; +struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; /* See "Frequency meter" comments, below. */ @@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; } - csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); + csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; @@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); - number_of_cpusets++; + cpuset_inc(); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - number_of_cpusets--; + cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); @@ -1992,7 +1987,6 @@ int __init cpuset_init(void) if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) BUG(); - number_of_cpusets = 1; return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b4381eaee715..a2955e101715 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1921,7 +1921,8 @@ zonelist_scan: if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - if ((alloc_flags & ALLOC_CPUSET) && + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); -- cgit v1.2.3 From f6187769dae48234f3877df3c4d99294cc2254fa Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:12 -0700 Subject: sys_sgetmask/sys_ssetmask: add CONFIG_SGETMASK_SYSCALL sys_sgetmask and sys_ssetmask are obsolete system calls no longer supported in libc. 
This patch replaces architecture related __ARCH_WANT_SYS_SGETMAX by expert mode configuration.That option is enabled by default for those architectures. Signed-off-by: Fabian Frederick Cc: Steven Miao Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Ralf Baechle Cc: Koichi Yasutake Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Greg Ungerer Cc: Heiko Carstens Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/blackfin/include/asm/unistd.h | 1 - arch/cris/include/asm/unistd.h | 1 - arch/frv/include/asm/unistd.h | 1 - arch/m68k/include/asm/unistd.h | 1 - arch/microblaze/include/asm/unistd.h | 1 - arch/mips/include/asm/unistd.h | 1 - arch/mn10300/include/asm/unistd.h | 1 - arch/parisc/include/asm/unistd.h | 1 - arch/powerpc/include/asm/unistd.h | 1 - arch/sh/include/asm/unistd.h | 1 - arch/sparc/include/asm/unistd.h | 1 - arch/x86/include/asm/unistd.h | 1 - init/Kconfig | 10 ++++++++++ kernel/signal.c | 4 ++-- kernel/sys_ni.c | 2 ++ 15 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index c35414bdf7bd..c8c8ff9eff61 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -12,7 +12,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_FADVISE64 #define __ARCH_WANT_SYS_GETPGRP diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h index 5cc7d1991e48..0f40fed1ba25 100644 --- a/arch/cris/include/asm/unistd.h +++ b/arch/cris/include/asm/unistd.h @@ -15,7 +15,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 70ec7293dce7..17b5df8fc28a 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -13,7 +13,6 @@ /* #define __ARCH_WANT_SYS_GETHOSTNAME */ #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -/* #define __ARCH_WANT_SYS_SGETMASK */ /* #define __ARCH_WANT_SYS_SIGNAL */ #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index 33afa56ad47a..1fcdd344c7ad 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -13,7 +13,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index b14232b6878f..fd56a8f66489 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -19,7 +19,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index 413d6c612bec..e55813029d5a 100644 --- 
a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_SYS_WAITPID #define __ARCH_WANT_SYS_SOCKETCALL diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index 9d4e2d1ef90e..0522468f488b 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -26,7 +26,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index 74d835820ee7..5f4c68daa261 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -145,7 +145,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_TIME diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 9b892bbd9d84..5ce5552ab9f5 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h index e77816c4b9bc..126fe8340b22 100644 --- a/arch/sh/include/asm/unistd.h +++ b/arch/sh/include/asm/unistd.h @@ -11,7 +11,6 @@ # define __ARCH_WANT_SYS_GETHOSTNAME # define __ARCH_WANT_SYS_IPC # define __ARCH_WANT_SYS_PAUSE -# define __ARCH_WANT_SYS_SGETMASK # define __ARCH_WANT_SYS_SIGNAL # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index dfa53fdd5cbc..0aac1e8f2968 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -25,7 +25,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 3f556c6a0157..2b19caa4081c 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -41,7 +41,6 @@ # define __ARCH_WANT_SYS_OLD_GETRLIMIT # define __ARCH_WANT_SYS_OLD_UNAME # define __ARCH_WANT_SYS_PAUSE -# define __ARCH_WANT_SYS_SGETMASK # define __ARCH_WANT_SYS_SIGNAL # define __ARCH_WANT_SYS_SIGPENDING # define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/init/Kconfig b/init/Kconfig index ce034ad4a162..9d76b99af1b9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1313,6 +1313,16 @@ config UID16 help This enables the legacy 16-bit UID syscall wrappers. 
+config SGETMASK_SYSCALL + bool "sgetmask/ssetmask syscalls support" if EXPERT + def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH + ---help--- + sys_sgetmask and sys_ssetmask are obsolete system calls + no longer supported in libc but still enabled by default in some + architectures. + + If unsure, leave the default option here. + config SYSFS_SYSCALL bool "Sysfs syscall support" if EXPERT default y diff --git a/kernel/signal.c b/kernel/signal.c index 6ea13c09ae56..6e600aaa2af4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3496,7 +3496,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, } #endif -#ifdef __ARCH_WANT_SYS_SGETMASK +#ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. Functionality superseded by sigprocmask. @@ -3517,7 +3517,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) return old; } -#endif /* __ARCH_WANT_SGETMASK */ +#endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bc8d1b74a6b9..36441b51b5df 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16); cond_syscall(sys_setresuid16); cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); +cond_syscall(sys_sgetmask); +cond_syscall(sys_ssetmask); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(sys_ipc); -- cgit v1.2.3 From 84117da5b79ffb4077bb05d64c86dfa4d746115c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:17 -0700 Subject: kernel/cpu.c: convert printk to pr_foo() no level printk converted to pr_warn (if err) no level printk converted to pr_info (disabling non-boot cpus) Other printk converted to respective level. Signed-off-by: Fabian Frederick Cc: "Rafael J. 
Wysocki" Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 247979a1b815..acf791c55b71 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -283,8 +283,7 @@ static inline void check_for_tasks(int cpu) task_cputime(p, &utime, &stime); if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (utime || stime)) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " - "(state = %ld, flags = %x)\n", + pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, p->state, p->flags); } @@ -336,8 +335,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (err) { nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("%s: attempt to take down CPU %u failed\n", - __func__, cpu); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); goto out_release; } @@ -444,8 +443,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", - __func__, cpu); + pr_warn("%s: attempt to bring up CPU %u failed\n", + __func__, cpu); goto out_notify; } @@ -475,11 +474,10 @@ int cpu_up(unsigned int cpu) int err = 0; if (!cpu_possible(cpu)) { - printk(KERN_ERR "can't online cpu %d because it is not " - "configured as may-hotadd at boot time\n", cpu); + pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", + cpu); #if defined(CONFIG_IA64) - printk(KERN_ERR "please check additional_cpus= boot " - "parameter\n"); + pr_err("please check additional_cpus= boot parameter\n"); #endif return -EINVAL; } @@ -518,7 +516,7 @@ int disable_nonboot_cpus(void) */ cpumask_clear(frozen_cpus); - printk("Disabling non-boot CPUs ...\n"); + pr_info("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; @@ -526,8 +524,7 @@ int disable_nonboot_cpus(void) if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { - printk(KERN_ERR "Error taking CPU%d down: %d\n", - cpu, error); + pr_err("Error taking CPU%d down: %d\n", cpu, error); break; } } @@ -537,7 +534,7 @@ int disable_nonboot_cpus(void) /* Make sure the CPUs won't be enabled by someone else */ cpu_hotplug_disabled = 1; } else { - printk(KERN_ERR "Non-boot CPUs are not disabled\n"); + pr_err("Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); return error; @@ -561,17 +558,17 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk(KERN_INFO "Enabling non-boot CPUs ...\n"); + pr_info("Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { - printk(KERN_INFO "CPU%d is up\n", cpu); + pr_info("CPU%d is up\n", cpu); continue; } - printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); + pr_warn("Error taking CPU%d up: %d\n", cpu, error); } arch_enable_nonboot_cpus_end(); -- cgit v1.2.3 From 462b29b8564c489e0aa3f5a3a505fd2776af5e55 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:18 -0700 Subject: kernel/backtracetest.c: replace no level printk by pr_info() Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/backtracetest.c | 18 +++++++++--------- 1 file changed, 9 
insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a5e026bc45c4..1323360d90e3 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -19,8 +19,8 @@ static void backtrace_test_normal(void) { - printk("Testing a backtrace from process context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a backtrace from process context.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); dump_stack(); } @@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); static void backtrace_test_irq(void) { - printk("Testing a backtrace from irq context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a backtrace from irq context.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); init_completion(&backtrace_work); tasklet_schedule(&backtrace_tasklet); @@ -51,8 +51,8 @@ static void backtrace_test_saved(void) struct stack_trace trace; unsigned long entries[8]; - printk("Testing a saved backtrace.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a saved backtrace.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); trace.nr_entries = 0; trace.max_entries = ARRAY_SIZE(entries); @@ -65,19 +65,19 @@ static void backtrace_test_saved(void) #else static void backtrace_test_saved(void) { - printk("Saved backtrace test skipped.\n"); + pr_info("Saved backtrace test skipped.\n"); } #endif static int backtrace_regression_test(void) { - printk("====[ backtrace testing ]===========\n"); + pr_info("====[ backtrace testing ]===========\n"); backtrace_test_normal(); backtrace_test_irq(); backtrace_test_saved(); - printk("====[ end of backtrace testing ]====\n"); + pr_info("====[ end of backtrace testing ]====\n"); return 0; } -- cgit v1.2.3 From a6c8c6902c53e620e607e83f520e9ae424e2a424 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:19 -0700 Subject: kernel/capability.c: code clean-up - EXPORT_SYMBOL - typo: unexpectidly->unexpectedly - function prototype over 80 characters Signed-off-by: Fabian Frederick Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index a8d63df0c322..84b2bbf443e7 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -24,7 +24,6 @@ */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; - EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; @@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to - * unexpectidly fail; the capget/modify/capset aborts + * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. */ @@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable); * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. 
*/ -bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +bool file_ns_capable(const struct file *file, struct user_namespace *ns, + int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; -- cgit v1.2.3 From b9e5db6d2bbe4416cd1c30c2d1891ef39d6bd0b7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:20 -0700 Subject: kernel/exec_domain.c: code clean-up Fix checkpatch warnings about EXPORT_SYMBOL and return() Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exec_domain.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0dbeae374225..83d4382f5699 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -37,7 +37,7 @@ static unsigned long ident_map[32] = { struct exec_domain default_exec_domain = { .name = "Linux", /* name */ .handler = default_handler, /* lcall7 causes a seg fault. */ - .pers_low = 0, /* PER_LINUX personality. */ + .pers_low = 0, /* PER_LINUX personality. */ .pers_high = 0, /* PER_LINUX personality. */ .signal_map = ident_map, /* Identity map signals. */ .signal_invmap = ident_map, /* - both ways. */ @@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality) ep = &default_exec_domain; out: read_unlock(&exec_domains_lock); - return (ep); + return ep; } int @@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep) out: write_unlock(&exec_domains_lock); - return (err); + return err; } +EXPORT_SYMBOL(register_exec_domain); int unregister_exec_domain(struct exec_domain *ep) @@ -133,6 +134,7 @@ unregister: write_unlock(&exec_domains_lock); return 0; } +EXPORT_SYMBOL(unregister_exec_domain); int __set_personality(unsigned int personality) { @@ -144,6 +146,7 @@ int __set_personality(unsigned int personality) return 0; } +EXPORT_SYMBOL(__set_personality); #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) @@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) return old; } - - -EXPORT_SYMBOL(register_exec_domain); -EXPORT_SYMBOL(unregister_exec_domain); -EXPORT_SYMBOL(__set_personality); -- cgit v1.2.3 From eaa1809b900c460a020bff1f4030f4f6a237b2b2 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:21 -0700 Subject: kernel/latencytop.c: convert seq_printf to seq_puts This patch also fixes one function declaration over 80 characters. 
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/latencytop.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a462b317f9a0..a02812743a7e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void) } static void __sched -account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) +account_global_scheduler_latency(struct task_struct *tsk, + struct latency_record *lat) { int firstnonnull = MAXLR + 1; int i; @@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v) break; seq_printf(m, " %ps", (void *)bt); } - seq_printf(m, "\n"); + seq_puts(m, "\n"); } } return 0; -- cgit v1.2.3 From cf25004069d3ccd6aae607d8175bdff67c1dd319 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:22 -0700 Subject: kernel/stop_machine.c: kernel-doc warning fix Signed-off-by: Fabian Frederick Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 01fbae5b97b7..695f0c6cd169 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * @cpu: cpu to stop * @fn: function to execute * @arg: argument to @fn + * @work_buf: pointer to cpu_stop_work structure * * Similar to stop_one_cpu() but doesn't wait for completion. The * caller is responsible for ensuring @work_buf is currently unused -- cgit v1.2.3 From cac92ba74f19fd58a28976f753f9327f27cf1669 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:23 -0700 Subject: kernel/tracepoint.c: kernel-doc fixes Signed-off-by: Fabian Frederick Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tracepoint.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 6620e5837ce2..33cbd8c203f8 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, * tracepoint_probe_register - Connect a probe to a tracepoint * @tp: tracepoint * @probe: probe handler + * @data: tracepoint data * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for @@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @tp: tracepoint * @probe: probe function pointer + * @data: tracepoint data * * Returns 0 if ok, error value on error. 
*/ -- cgit v1.2.3 From 6c5a53c67057bddf7f8e26c93a8e045215f61539 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:24 -0700 Subject: kernel/res_counter.c: replace simple_strtoull by kstrtoull [akpm@linux-foundation.org: don't overwrite kstrtoull()'s errno] Signed-off-by: Fabian Frederick Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 51dbac6a3633..e791130f85a7 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf, /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ if (*buf == '-') { - res = simple_strtoull(buf + 1, &end, 10); - if (res != 1 || *end != '\0') + int rc = kstrtoull(buf + 1, 10, &res); + + if (rc) + return rc; + if (res != 1) return -EINVAL; *resp = RES_COUNTER_MAX; return 0; -- cgit v1.2.3 From 616feab753972b9751308f3cd2a68fc57eae8edb Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:25 -0700 Subject: kernel/reboot.c: convert simple_strtoul to kstrtoint Replace obsolete function. kstrtoint is used as reboot_cpu is an integer. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 662c83fc16b7..a3a9e240fcdb 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -388,15 +388,22 @@ static int __init reboot_setup(char *str) break; case 's': - if (isdigit(*(str+1))) - reboot_cpu = simple_strtoul(str+1, NULL, 0); - else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) - reboot_cpu = simple_strtoul(str+3, NULL, 0); - else + { + int rc; + + if (isdigit(*(str+1))) { + rc = kstrtoint(str+1, 0, &reboot_cpu); + if (rc) + return rc; + } else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) { + rc = kstrtoint(str+3, 0, &reboot_cpu); + if (rc) + return rc; + } else reboot_mode = REBOOT_SOFT; break; - + } case 'g': reboot_mode = REBOOT_GPIO; break; -- cgit v1.2.3 From 95583e4ab5745218373add88ffddb70faff2d0c8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:26 -0700 Subject: kernel/utsname_sysctl.c: replace obsolete __initcall by device_initcall Also fixes checkpatch warnings on proc_dostring function parameters Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/utsname_sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4f69f9a5e221..6fbe811c7ad1 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -51,7 +51,7 @@ static int proc_do_uts_string(ctl_table *table, int write, int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,buffer,lenp, ppos); + r = proc_dostring(&uts_table, write, buffer, lenp, ppos); put_uts(table, write, uts_table.data); if (write) @@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void) return 0; } -__initcall(utsname_sysctl_init); +device_initcall(utsname_sysctl_init); -- cgit v1.2.3 From b51dbec68c8732caac2495f558659556523e8322 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:26 -0700 Subject: 
kernel/hung_task.c: convert simple_strtoul to kstrtouint sysctl_hung_task_panic has been changed to unsigned int. use kstrtouint instead of obsolete simple_strtoul Signed-off-by: Fabian Frederick Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06bb1417b063..06db12434d72 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic = static int __init hung_task_panic_setup(char *str) { - sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); + if (rc) + return rc; return 1; } __setup("hung_task_panic=", hung_task_panic_setup); -- cgit v1.2.3 From b300a4ea665f7fa44f015616ac1874deca891c5e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:11:27 -0700 Subject: kernel/user.c: drop unused field 'files' from user_struct Nobody seems uses it for a long time. Let's drop it. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/user.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2f2dd7d932a2..611676fd4c2c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -745,7 +745,6 @@ static inline int signal_group_exit(const struct signal_struct *sig) struct user_struct { atomic_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ - atomic_t files; /* How many open files does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_INOTIFY_USER atomic_t inotify_watches; /* How many inotify watches does this user have? */ diff --git a/kernel/user.c b/kernel/user.c index 294fc6a94168..4efa39350e44 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock); struct user_struct root_user = { .__count = ATOMIC_INIT(1), .processes = ATOMIC_INIT(1), - .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, -- cgit v1.2.3 From 0a581694ab7a5bc083d710df8a552a6a055b005f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:28 -0700 Subject: printk: split code for making free space in the log buffer The check for free space in the log buffer always passes when "first_seq" and "next_seq" are equal. In theory, it might cause writing outside of the log buffer. Fortunately, the current usage looks safe because the used "text" and "dict" buffers are quite limited. See the second patch for more details. Anyway, it is better to be on the safe side and add a check. An easy solution is done in the 2nd patch and it is improved in the 4th patch. 5th patch fixes the computation of the printed message length. 1st and 3rd patches just do some code refactoring to make the other patches easier. This patch (of 5): There will be needed some fixes in the check for free space. They will be easier if the code is moved outside of the quite long log_store() function. This patch does not change the existing behavior. 
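For readers new to this code, here is a minimal stand-alone model of the accounting that the factored-out helpers implement (the sizes are made up; the kernel works on the real log buffer): records must be contiguous, so when the write index is ahead of the read index only the larger of the two gaps, the one at the end of the buffer or the one at its start, is usable.

#include <stdio.h>

#define LOG_BUF_LEN 32u	/* tiny buffer so the numbers are easy to follow */
#define HDR_SIZE 4u	/* stands in for sizeof(struct printk_log) */

static unsigned int log_first_idx;	/* start of the oldest record */
static unsigned int log_next_idx;	/* where the next record would go */

static int logbuf_has_space_model(unsigned int msg_size)
{
	unsigned int free;

	if (log_next_idx > log_first_idx)
		/* two gaps: [next, end) and [0, first); only one can hold a record */
		free = (LOG_BUF_LEN - log_next_idx > log_first_idx) ?
			LOG_BUF_LEN - log_next_idx : log_first_idx;
	else
		free = log_first_idx - log_next_idx;

	/* keep room for the empty header that marks a wrap-around */
	return free >= msg_size + HDR_SIZE;
}

int main(void)
{
	log_first_idx = 8;	/* oldest record starts at offset 8 */
	log_next_idx = 24;	/* next record would start at offset 24 */

	/* both gaps are 8 bytes: a 4-byte payload (4 + 4 <= 8) fits, 5 does not */
	printf("4-byte msg fits: %d\n", logbuf_has_space_model(4));
	printf("5-byte msg fits: %d\n", logbuf_has_space_model(5));
	return 0;
}

The helpers introduced below, logbuf_has_space() and log_make_free_space(), encapsulate exactly this check.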
Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 221229cf0190..99b7a2d87b6a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -297,6 +297,34 @@ static u32 log_next(u32 idx) return idx + msg->len; } +/* check whether there is enough free space for the given message */ +static int logbuf_has_space(u32 msg_size) +{ + u32 free; + + if (log_next_idx > log_first_idx) + free = max(log_buf_len - log_next_idx, log_first_idx); + else + free = log_first_idx - log_next_idx; + + /* + * We need space also for an empty header that signalizes wrapping + * of the buffer. + */ + return free >= msg_size + sizeof(struct printk_log); +} + +static void log_make_free_space(u32 msg_size) +{ + while (log_first_seq < log_next_seq) { + if (logbuf_has_space(msg_size)) + return; + /* drop old messages until we have enough continuous space */ + log_first_idx = log_next(log_first_idx); + log_first_seq++; + } +} + /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, enum log_flags flags, u64 ts_nsec, @@ -311,21 +339,7 @@ static void log_store(int facility, int level, pad_len = (-size) & (LOG_ALIGN - 1); size += pad_len; - while (log_first_seq < log_next_seq) { - u32 free; - - if (log_next_idx > log_first_idx) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; - - if (free >= size + sizeof(struct printk_log)) - break; - - /* drop old messages until we have enough contiuous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; - } + log_make_free_space(size); if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* -- cgit v1.2.3 From f40e4b9f70d48eb08f443642283fdd9d05b27c6d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:30 -0700 Subject: printk: ignore too long messages There was no check for too long messages. The check for free space always passed when first_seq and next_seq were equal. Enough free space was not guaranteed, though. log_store() might be called to store messages up to 64kB + 64kB + 16B. This is the sum of the maximal text_len and dict_len values, and the size of the structure printk_log. On the other hand, the minimal size for the main log buffer currently is 4kB and it is enforced only by Kconfig. The good news is that the usage looks safe right now. log_store() is called only from vprintk_emit() and cont_flush(). Here the "text" part is always passed via a static buffer and the length is limited to LOG_LINE_MAX which is 1024. The "dict" part is NULL in most cases. The only exceptions are when vprintk_emit() is called from printk_emit() and dev_vprintk_emit(). But printk_emit() is currently used only in devkmsg_writev() and here "dict" is NULL as well. In dev_vprintk_emit(), "dict" is limited by the static buffer "hdr" of the size 128 bytes. It means that the current maximal printed text is 1024B + 128B + 16B and it always fits the log buffer. But it is only a matter of time before someone calls printk_emit() with unsafe parameters, especially the "dict" one. This patch adds a check for the free space when the buffer is empty. It reuses the already existing logbuf_has_space() function but it has to add an extra parameter.
It defines whether the buffer is empty. Note that the same values of "first_idx" and "next_idx" might also mean that the buffer is full. If the buffer is empty, we must respect the current position of the indexes. We cannot reset them to the beginning of the buffer. Otherwise, the functions reading the buffer would get crazy. The question is what to do when the message is too long. This patch uses the easiest solution and just ignores the problematic message. Let's do something better in a followup patch. Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 99b7a2d87b6a..8fbbab1771eb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -297,12 +297,20 @@ static u32 log_next(u32 idx) return idx + msg->len; } -/* check whether there is enough free space for the given message */ -static int logbuf_has_space(u32 msg_size) +/* + * Check whether there is enough free space for the given message. + * + * The same values of first_idx and next_idx mean that the buffer + * is either empty or full. + * + * If the buffer is empty, we must respect the position of the indexes. + * They cannot be reset to the beginning of the buffer. + */ +static int logbuf_has_space(u32 msg_size, bool empty) { u32 free; - if (log_next_idx > log_first_idx) + if (log_next_idx > log_first_idx || empty) free = max(log_buf_len - log_next_idx, log_first_idx); else free = log_first_idx - log_next_idx; @@ -314,15 +322,21 @@ static int logbuf_has_space(u32 msg_size) return free >= msg_size + sizeof(struct printk_log); } -static void log_make_free_space(u32 msg_size) +static int log_make_free_space(u32 msg_size) { while (log_first_seq < log_next_seq) { - if (logbuf_has_space(msg_size)) - return; + if (logbuf_has_space(msg_size, false)) + return 0; /* drop old messages until we have enough continuous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } + + /* sequence numbers are equal, so the log buffer is empty */ + if (logbuf_has_space(msg_size, true)) + return 0; + + return -ENOMEM; } /* insert record into the buffer, discard old ones, update heads */ @@ -339,7 +353,9 @@ static void log_store(int facility, int level, pad_len = (-size) & (LOG_ALIGN - 1); size += pad_len; - log_make_free_space(size); + /* if message does not fit empty log buffer, ignore it */ + if (log_make_free_space(size)) + return; if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* -- cgit v1.2.3 From 85c87043023b7e5535f975bbee12a4f5399df520 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:31 -0700 Subject: printk: split message size computation We will want to recompute the message size when shrinking too long messages. Let's put the code into separate function. The side effect of setting "pad_len" is not nice but it is worth removing the code duplication. Note that I will probably have one more usage for this function when handling messages safe way in NMI context. This patch does not change the existing behavior. 
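As a rough illustration of the computation being factored out (the constants below are stand-ins; the kernel derives the 16-byte header and the alignment from struct printk_log): a record occupies header + text + dict rounded up to the record alignment, and the u16 lengths allow a worst case of about 64kB + 64kB + 16B, far beyond the 4kB minimum log buffer mentioned in the previous patch.

#include <stdio.h>

#define LOG_ALIGN 8u	/* stands in for the alignment of struct printk_log */
#define HDR_SIZE 16u	/* stands in for sizeof(struct printk_log) */

static unsigned int msg_used_size(unsigned int text_len, unsigned int dict_len,
				  unsigned int *pad_len)
{
	unsigned int size = HDR_SIZE + text_len + dict_len;

	/* bytes needed to reach the next multiple of LOG_ALIGN */
	*pad_len = (-size) & (LOG_ALIGN - 1);
	return size + *pad_len;
}

int main(void)
{
	unsigned int pad, size;

	/* today's practical maximum: a 1024-byte text line plus a 128-byte dict */
	size = msg_used_size(1024, 128, &pad);
	printf("typical worst case: %u bytes (pad %u)\n", size, pad);

	/* absolute worst case allowed by the u16 lengths, which can never fit
	 * the minimal 4kB log buffer */
	size = msg_used_size(0xffff, 0xffff, &pad);
	printf("type-level worst case: %u bytes\n", size);
	return 0;
}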
Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8fbbab1771eb..9f088ed8404b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -339,6 +339,18 @@ static int log_make_free_space(u32 msg_size) return -ENOMEM; } +/* compute the message size including the padding bytes */ +static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) +{ + u32 size; + + size = sizeof(struct printk_log) + text_len + dict_len; + *pad_len = (-size) & (LOG_ALIGN - 1); + size += *pad_len; + + return size; +} + /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, enum log_flags flags, u64 ts_nsec, @@ -349,9 +361,7 @@ static void log_store(int facility, int level, u32 size, pad_len; /* number of '\0' padding bytes to next message */ - size = sizeof(struct printk_log) + text_len + dict_len; - pad_len = (-size) & (LOG_ALIGN - 1); - size += pad_len; + size = msg_used_size(text_len, dict_len, &pad_len); /* if message does not fit empty log buffer, ignore it */ if (log_make_free_space(size)) -- cgit v1.2.3 From 55bd53a4eb3dd18be8744f8b4d026068fc801a62 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:32 -0700 Subject: printk: shrink too long messages We might want to print at least part of too long messages and add some warning for debugging purpose. The question is how long the shrunken message should be. If we use the whole buffer, it might get rotated too soon. Let's try to use only 1/4 of the buffer for now. Also shrink the whole dictionary. We do not want to parse it or break it in the middle of some pair of values. It would not cause any real harm but still. Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9f088ed8404b..7131dd4d0e3a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -351,6 +351,32 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) return size; } +/* + * Define how much of the log buffer we could take at maximum. The value + * must be greater than two. Note that only half of the buffer is available + * when the index points to the middle. + */ +#define MAX_LOG_TAKE_PART 4 +static const char trunc_msg[] = ""; + +static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, + u16 *dict_len, u32 *pad_len) +{ + /* + * The message should not take the whole buffer. Otherwise, it might + * get removed too soon. 
+ */ + u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; + if (*text_len > max_text_len) + *text_len = max_text_len; + /* enable the warning message */ + *trunc_msg_len = strlen(trunc_msg); + /* disable the "dict" completely */ + *dict_len = 0; + /* compute the size again, count also the warning message */ + return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); +} + /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, enum log_flags flags, u64 ts_nsec, @@ -359,13 +385,19 @@ static void log_store(int facility, int level, { struct printk_log *msg; u32 size, pad_len; + u16 trunc_msg_len = 0; /* number of '\0' padding bytes to next message */ size = msg_used_size(text_len, dict_len, &pad_len); - /* if message does not fit empty log buffer, ignore it */ - if (log_make_free_space(size)) - return; + if (log_make_free_space(size)) { + /* truncate the message if it is too long for empty buffer */ + size = truncate_msg(&text_len, &trunc_msg_len, + &dict_len, &pad_len); + /* survive when the log buffer is too small for trunc_msg */ + if (log_make_free_space(size)) + return; + } if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* @@ -381,6 +413,10 @@ static void log_store(int facility, int level, msg = (struct printk_log *)(log_buf + log_next_idx); memcpy(log_text(msg), text, text_len); msg->text_len = text_len; + if (trunc_msg_len) { + memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); + msg->text_len += trunc_msg_len; + } memcpy(log_dict(msg), dict, dict_len); msg->dict_len = dict_len; msg->facility = facility; -- cgit v1.2.3 From 034633ccb24d675850f99bf85c1c5880c831e4b6 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:33 -0700 Subject: printk: return really stored message length I wonder if anyone uses the printk return value but it is there and should be counted correctly. This patch modifies log_store() to return the number of really stored bytes from the 'text' part. Also it handles the return value in vprintk_emit(). Note that log_store() is used also in cont_flush() but we could ignore the return value there. The function works with characters that were already counted earlier. In addition, the store could never fail here because the length of the printed text is limited by the "cont" buffer and "dict" is NULL.
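A minimal sketch of the convention this patch establishes (all buffer handling stubbed out, sizes made up): log_store() reports how many text bytes really ended up in the buffer, counting the truncation marker when one is appended and returning 0 when the record is dropped, and the caller accumulates those values into printed_len.

#include <stdio.h>

static unsigned int log_free = 32;	/* pretend free space in the log buffer */

static unsigned int log_store(unsigned int text_len)
{
	const unsigned int trunc_msg_len = sizeof("<truncated>") - 1;

	if (text_len <= log_free) {		/* fits as-is */
		log_free -= text_len;
		return text_len;
	}
	if (trunc_msg_len < log_free) {		/* store a shortened copy + marker */
		text_len = log_free - trunc_msg_len;
		log_free = 0;
		return text_len + trunc_msg_len;
	}
	return 0;				/* dropped entirely */
}

int main(void)
{
	unsigned int printed_len = 0;

	printed_len += log_store(10);	/* stored in full           -> +10 */
	printed_len += log_store(100);	/* truncated: 11 + 11 bytes -> +22 */
	printed_len += log_store(5);	/* no room left, dropped    -> +0  */
	printf("printed_len = %u\n", printed_len);	/* prints 32 */
	return 0;
}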
Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7131dd4d0e3a..7476a53bc378 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -378,10 +378,10 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, } /* insert record into the buffer, discard old ones, update heads */ -static void log_store(int facility, int level, - enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, - const char *text, u16 text_len) +static int log_store(int facility, int level, + enum log_flags flags, u64 ts_nsec, + const char *dict, u16 dict_len, + const char *text, u16 text_len) { struct printk_log *msg; u32 size, pad_len; @@ -396,7 +396,7 @@ static void log_store(int facility, int level, &dict_len, &pad_len); /* survive when the log buffer is too small for trunc_msg */ if (log_make_free_space(size)) - return; + return 0; } if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { @@ -432,6 +432,8 @@ static void log_store(int facility, int level, /* insert message */ log_next_idx += msg->len; log_next_seq++; + + return msg->text_len; } #ifdef CONFIG_SECURITY_DMESG_RESTRICT @@ -1606,10 +1608,10 @@ asmlinkage int vprintk_emit(int facility, int level, "BUG: recent printk recursion!"; recursion_bug = 0; - printed_len += strlen(recursion_msg); + text_len = strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, printed_len); + printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, recursion_msg, text_len); } /* @@ -1662,9 +1664,12 @@ asmlinkage int vprintk_emit(int facility, int level, cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ - if (!cont_add(facility, level, text, text_len)) - log_store(facility, level, lflags | LOG_CONT, 0, - dict, dictlen, text, text_len); + if (cont_add(facility, level, text, text_len)) + printed_len += text_len; + else + printed_len += log_store(facility, level, + lflags | LOG_CONT, 0, + dict, dictlen, text, text_len); } else { bool stored = false; @@ -1683,11 +1688,12 @@ asmlinkage int vprintk_emit(int facility, int level, cont_flush(LOG_NEWLINE); } - if (!stored) - log_store(facility, level, lflags, 0, - dict, dictlen, text, text_len); + if (stored) + printed_len += text_len; + else + printed_len += log_store(facility, level, lflags, 0, + dict, dictlen, text, text_len); } - printed_len += text_len; /* * Try to acquire and then immediately release the console semaphore. -- cgit v1.2.3 From ca1d432ad8a527fabc5c7ceed8526e3a28de121c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:34 -0700 Subject: printk: remove outdated comment Comment about interesting interlocking between lockbuf_lock and console_sem is outdated. It was added in 2002 by commit a880f45a48be during conversion of console_lock to console_sem + lockbuf_lock. At that time release_console_sem() (today's equivalent is console_unlock()) was indeed using lockbuf_lock to avoid races between trylock on console_sem in printk() and unlock of console_sem. However these days the interlocking is gone and the races are avoided by rechecking logbuf state after releasing console_sem. 
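A stand-alone sketch of that recheck-and-retry scheme (single-threaded, so the race it guards against cannot actually occur here; it only shows the control flow): a printk() that loses the trylock still stores its message, and the lock holder notices it by re-checking for pending records after dropping the lock.

#include <stdbool.h>
#include <stdio.h>

static bool console_locked;
static int pending;			/* messages stored but not printed yet */

static bool console_trylock_model(void)
{
	if (console_locked)
		return false;
	console_locked = true;
	return true;
}

static void console_unlock_model(void)
{
	bool retry;

	do {
		while (pending > 0) {		/* flush everything we can see */
			pending--;
			printf("printed one record\n");
		}
		console_locked = false;		/* release the lock... */
		/* ...then re-check: a printk() that ran meanwhile may have
		 * stored a record and failed its own trylock */
		retry = pending > 0 && console_trylock_model();
	} while (retry);
}

static void printk_model(void)
{
	pending++;				/* store the message first */
	if (console_trylock_model())		/* print it ourselves if we can */
		console_unlock_model();
	/* otherwise the current lock holder picks it up via the recheck */
}

int main(void)
{
	printk_model();
	printk_model();
	return 0;
}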
Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7476a53bc378..5bc54478c963 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -206,8 +206,7 @@ struct printk_log { }; /* - * The logbuf_lock protects kmsg buffer, indices, counters. It is also - * used in interesting ways to provide interlocking in console_unlock(); + * The logbuf_lock protects kmsg buffer, indices, counters. */ static DEFINE_RAW_SPINLOCK(logbuf_lock); -- cgit v1.2.3 From 608873cacb9d0d2811586fcc79a38b64eabd6d32 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:35 -0700 Subject: printk: release lockbuf_lock before calling console_trylock_for_printk() There's no reason to hold lockbuf_lock when entering console_trylock_for_printk(). The first thing this function does is to call down_trylock(console_sem) and if that fails it immediately unlocks lockbuf_lock. So lockbuf_lock isn't needed for that branch. When down_trylock() succeeds, the rest of console_trylock() is OK without lockbuf_lock (it is called without it from other places), and the only remaining thing in console_trylock_for_printk() is can_use_console() call. For that call console_sem is enough (it iterates all consoles and checks CON_ANYTIME flag). So we drop logbuf_lock before entering console_trylock_for_printk() which simplifies the code. [akpm@linux-foundation.org: fix have_callable_console() comment] Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 54 ++++++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5bc54478c963..6e1b21a8a497 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -249,9 +249,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; -/* cpu currently holding logbuf_lock */ -static volatile unsigned int logbuf_cpu = UINT_MAX; - /* human readable text of the record */ static char *log_text(const struct printk_log *msg) { @@ -1380,7 +1377,10 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -/* Check if we have any console registered that can be called early in boot. */ +/* + * Check if we have any console that is capable of printing while cpu is + * booting or shutting down. Requires console_sem. + */ static int have_callable_console(void) { struct console *con; @@ -1410,36 +1410,22 @@ static inline int can_use_console(unsigned int cpu) * messages from a 'printk'. Return true (and with the * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. - * - * This gets called with the 'logbuf_lock' spinlock held and - * interrupts disabled. It should return with 'lockbuf_lock' - * released but interrupts still disabled. */ static int console_trylock_for_printk(unsigned int cpu) - __releases(&logbuf_lock) { - int retval = 0, wake = 0; - - if (console_trylock()) { - retval = 1; - - /* - * If we can't use the console, we need to release - * the console semaphore by hand to avoid flushing - * the buffer. We need to hold the console semaphore - * in order to do this test safely. 
- */ - if (!can_use_console(cpu)) { - console_locked = 0; - wake = 1; - retval = 0; - } - } - logbuf_cpu = UINT_MAX; - raw_spin_unlock(&logbuf_lock); - if (wake) + if (!console_trylock()) + return 0; + /* + * If we can't use the console, we need to release the console + * semaphore by hand to avoid flushing the buffer. We need to hold the + * console semaphore in order to do this test safely. + */ + if (!can_use_console(cpu)) { + console_locked = 0; up(&console_sem); - return retval; + return 0; + } + return 1; } int printk_delay_msec __read_mostly; @@ -1572,6 +1558,9 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; int this_cpu; int printed_len = 0; + /* cpu currently holding logbuf_lock in this function */ + static volatile unsigned int logbuf_cpu = UINT_MAX; + boot_delay_msec(level); printk_delay(); @@ -1694,13 +1683,12 @@ asmlinkage int vprintk_emit(int facility, int level, dict, dictlen, text, text_len); } + logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); /* * Try to acquire and then immediately release the console semaphore. * The release will print out buffers and wake up /dev/kmsg and syslog() * users. - * - * The console_trylock_for_printk() function will release 'logbuf_lock' - * regardless of whether it actually gets the console semaphore or not. */ if (console_trylock_for_printk(this_cpu)) console_unlock(); -- cgit v1.2.3 From bd8d7cf5b8410fe98eba06a9aaa90efe88815d8a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:36 -0700 Subject: printk: fix lockdep instrumentation of console_sem Printk calls mutex_acquire() / mutex_release() by hand to instrument lockdep about console_sem. However in some corner cases the instrumentation is missing. Fix the problem by creating helper functions for locking / unlocking console_sem which take care of lockdep instrumentation as well. Signed-off-by: Jan Kara Reported-by: Fabio Estevam Reported-by: Andy Shevchenko Tested-by: Fabio Estevam Tested-By: Valdis Kletnieks Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6e1b21a8a497..5ba37f813723 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -90,6 +90,29 @@ static struct lockdep_map console_lock_dep_map = { }; #endif +/* + * Helper macros to handle lockdep when locking/unlocking console_sem. We use + * macros instead of functions so that _RET_IP_ contains useful information. + */ +#define down_console_sem() do { \ + down(&console_sem);\ + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ +} while (0) + +static int __down_trylock_console_sem(unsigned long ip) +{ + if (down_trylock(&console_sem)) + return 1; + mutex_acquire(&console_lock_dep_map, 0, 1, ip); + return 0; +} +#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) + +#define up_console_sem() do { \ + mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ + up(&console_sem);\ +} while (0) + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. 
It's @@ -1422,7 +1445,7 @@ static int console_trylock_for_printk(unsigned int cpu) */ if (!can_use_console(cpu)) { console_locked = 0; - up(&console_sem); + up_console_sem(); return 0; } return 1; @@ -1951,16 +1974,14 @@ void suspend_console(void) printk("Suspending console(s) (use no_console_suspend to debug)\n"); console_lock(); console_suspended = 1; - up(&console_sem); - mutex_release(&console_lock_dep_map, 1, _RET_IP_); + up_console_sem(); } void resume_console(void) { if (!console_suspend_enabled) return; - down(&console_sem); - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); + down_console_sem(); console_suspended = 0; console_unlock(); } @@ -2002,12 +2023,11 @@ void console_lock(void) { might_sleep(); - down(&console_sem); + down_console_sem(); if (console_suspended) return; console_locked = 1; console_may_schedule = 1; - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -2021,15 +2041,14 @@ EXPORT_SYMBOL(console_lock); */ int console_trylock(void) { - if (down_trylock(&console_sem)) + if (down_trylock_console_sem()) return 0; if (console_suspended) { - up(&console_sem); + up_console_sem(); return 0; } console_locked = 1; console_may_schedule = 0; - mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2091,7 +2110,7 @@ void console_unlock(void) bool retry; if (console_suspended) { - up(&console_sem); + up_console_sem(); return; } @@ -2153,7 +2172,6 @@ skip: local_irq_restore(flags); } console_locked = 0; - mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) @@ -2161,7 +2179,7 @@ skip: raw_spin_unlock(&logbuf_lock); - up(&console_sem); + up_console_sem(); /* * Someone could have filled up the buffer again, so re-check if there's @@ -2206,7 +2224,7 @@ void console_unblank(void) * oops_in_progress is set to 1.. */ if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) + if (down_trylock_console_sem() != 0) return; } else console_lock(); -- cgit v1.2.3 From 939f04bec1a4ef6ba4370b0f34b01decc844b1b1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:37 -0700 Subject: printk: enable interrupts before calling console_trylock_for_printk() We need interrupts disabled when calling console_trylock_for_printk() only so that cpu id we pass to can_use_console() remains valid (for other things console_sem provides all the exclusion we need and deadlocks on console_sem due to interrupts are impossible because we use down_trylock()). However if we are rescheduled, we are guaranteed to run on an online cpu so we can easily just get the cpu id in can_use_console(). We can lose a bit of performance when we enable interrupts in vprintk_emit() and then disable them again in console_unlock() but OTOH it can somewhat reduce interrupt latency caused by console_unlock() especially since later in the patch series we will want to spin on console_sem in console_trylock_for_printk(). Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5ba37f813723..4e22230f1f6c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1418,10 +1418,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? 
* - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1434,8 +1433,10 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void) { + unsigned int cpu = smp_processor_id(); + if (!console_trylock()) return 0; /* @@ -1605,7 +1606,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1708,17 +1710,22 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); + + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to console + */ + preempt_disable(); /* * Try to acquire and then immediately release the console semaphore. * The release will print out buffers and wake up /dev/kmsg and syslog() * users. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); - - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); + preempt_enable(); return printed_len; } -- cgit v1.2.3 From 458df9fd4815b47809875d57f42e16401674b621 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Jun 2014 16:11:38 -0700 Subject: printk: remove separate printk_sched buffers and use printk buf instead To prevent deadlocks with doing a printk inside the scheduler, printk_sched() was created. The issue is that printk has a console_sem that it can grab and release. The release does a wake up if there's a task pending on the sem, and this wake up grabs the rq locks that is held in the scheduler. This leads to a possible deadlock if the wake up uses the same rq as the one with the rq lock held already. What printk_sched() does is to save the printk write in a per cpu buffer and sets the PRINTK_PENDING_SCHED flag. On a timer tick, if this flag is set, the printk() is done against the buffer. There's a couple of issues with this approach. 1) If two printk_sched()s are called before the tick, the second one will overwrite the first one. 2) The temporary buffer is 512 bytes and is per cpu. This is a quite a bit of space wasted for something that is seldom used. In order to remove this, the printk_sched() can use the printk buffer instead, and delay the console_trylock()/console_unlock() to the queued work. Because printk_sched() would then be taking the logbuf_lock, the logbuf_lock must not be held while doing anything that may call into the scheduler functions, which includes wake ups. Unfortunately, printk() also has a console_sem that it uses, and on release, the up(&console_sem) may do a wake up of any pending waiters. This must be avoided while holding the logbuf_lock. 
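A stand-alone sketch of the deferral scheme described above (names are illustrative, not the kernel's exact API): the scheduler-context caller only stores its message and sets a pending flag; the console flush, and any wake-ups it implies, happens later from a separate callback that runs outside the runqueue lock.

#include <stdio.h>

static int printk_pending;	/* models the per-cpu PRINTK_PENDING_OUTPUT bit */
static int queued_records;	/* models records sitting in the log buffer */

static void console_flush(void)	/* may wake up waiters -> unsafe under rq lock */
{
	while (queued_records > 0) {
		queued_records--;
		printf("flushed one record\n");
	}
}

static int printk_deferred_model(const char *msg)
{
	(void)msg;
	queued_records++;	/* store into the log buffer only */
	printk_pending = 1;	/* and ask for a flush later */
	return 0;
}

static void wake_up_klogd_work_func(void)	/* irq_work-style callback */
{
	if (printk_pending) {
		printk_pending = 0;
		console_flush();
	}
}

int main(void)
{
	printk_deferred_model("sched: RT throttling activated");
	wake_up_klogd_work_func();
	return 0;
}

The hunks below apply this split to vprintk_emit() and reuse the existing klogd irq_work callback for the deferred flush.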
Signed-off-by: Steven Rostedt Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 47 +++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4e22230f1f6c..247b0c1fadfc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -68,6 +68,9 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +/* Deferred messaged from sched code are marked by this special level */ +#define SCHED_MESSAGE_LOGLEVEL -2 + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -229,7 +232,9 @@ struct printk_log { }; /* - * The logbuf_lock protects kmsg buffer, indices, counters. + * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken + * within the scheduler's rq lock. It must be released before calling + * console_unlock() or anything else that might wake up a process. */ static DEFINE_RAW_SPINLOCK(logbuf_lock); @@ -1577,14 +1582,19 @@ asmlinkage int vprintk_emit(int facility, int level, static int recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; - size_t text_len; + size_t text_len = 0; enum log_flags lflags = 0; unsigned long flags; int this_cpu; int printed_len = 0; + bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ static volatile unsigned int logbuf_cpu = UINT_MAX; + if (level == SCHED_MESSAGE_LOGLEVEL) { + level = -1; + in_sched = true; + } boot_delay_msec(level); printk_delay(); @@ -1631,7 +1641,12 @@ asmlinkage int vprintk_emit(int facility, int level, * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. */ - text_len = vscnprintf(text, sizeof(textbuf), fmt, args); + if (in_sched) + text_len = scnprintf(text, sizeof(textbuf), + KERN_WARNING "[sched_delayed] "); + + text_len += vscnprintf(text + text_len, + sizeof(textbuf) - text_len, fmt, args); /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { @@ -1713,6 +1728,10 @@ asmlinkage int vprintk_emit(int facility, int level, lockdep_on(); local_irq_restore(flags); + /* If called from the scheduler, we can not call up(). */ + if (in_sched) + return printed_len; + /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to console @@ -2532,21 +2551,19 @@ late_initcall(printk_late_init); /* * Delayed printk version, for scheduler-internal messages: */ -#define PRINTK_BUF_SIZE 512 - #define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 +#define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - pr_warn("[sched_delayed] %s", buf); + if (pending & PRINTK_PENDING_OUTPUT) { + /* If trylock fails, someone else is doing the printing */ + if (console_trylock()) + console_unlock(); } if (pending & PRINTK_PENDING_WAKEUP) @@ -2570,21 +2587,15 @@ void wake_up_klogd(void) int printk_sched(const char *fmt, ...) 
{ - unsigned long flags; va_list args; - char *buf; int r; - local_irq_save(flags); - buf = __get_cpu_var(printk_sched_buf); - va_start(args, fmt); - r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); va_end(args); - __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - local_irq_restore(flags); return r; } -- cgit v1.2.3 From 81954606265ab8f04b41154bd00576013affcf5b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 4 Jun 2014 16:11:39 -0700 Subject: printk: disable preemption for printk_sched An earlier change in -mm (printk: remove separate printk_sched buffers...), removed the printk_sched irqsave/restore lines since it was safe for current users. Since we may be expanding usage of printk_sched(), disable preepmtion for this function to make it more generally safe to call. Signed-off-by: John Stultz Reviewed-by: Jan Kara Cc: Peter Zijlstra Cc: Jiri Bohac Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 247b0c1fadfc..dc2b8bd9bc1e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2590,12 +2590,14 @@ int printk_sched(const char *fmt, ...) va_list args; int r; + preempt_disable(); va_start(args, fmt); r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); va_end(args); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + preempt_enable(); return r; } -- cgit v1.2.3 From aac74dc495456412c4130a1167ce4beb6c1f0b38 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 4 Jun 2014 16:11:40 -0700 Subject: printk: rename printk_sched to printk_deferred After learning we'll need some sort of deferred printk functionality in the timekeeping core, Peter suggested we rename the printk_sched function so it can be reused by needed subsystems. This only changes the function name. No logic changes. Signed-off-by: John Stultz Reviewed-by: Steven Rostedt Cc: Jan Kara Cc: Peter Zijlstra Cc: Jiri Bohac Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 6 +++--- kernel/printk/printk.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/rt.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 8752f7595b27..7847301e2837 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -128,9 +128,9 @@ asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); /* - * Special printk facility for scheduler use only, _DO_NOT_USE_ ! + * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! */ -__printf(1, 2) __cold int printk_sched(const char *fmt, ...); +__printf(1, 2) __cold int printk_deferred(const char *fmt, ...); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -165,7 +165,7 @@ int printk(const char *s, ...) return 0; } static inline __printf(1, 2) __cold -int printk_sched(const char *s, ...) +int printk_deferred(const char *s, ...) 
{ return 0; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index dc2b8bd9bc1e..35d9db251903 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2585,7 +2585,7 @@ void wake_up_klogd(void) preempt_enable(); } -int printk_sched(const char *fmt, ...) +int printk_deferred(const char *fmt, ...) { va_list args; int r; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 913c6d6cc2c1..caf03e89a068 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1367,7 +1367,7 @@ out: * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", + printk_deferred("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f9ca7d19781a..d17e1c48a79d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -352,7 +352,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, if (!lag_once) { lag_once = true; - printk_sched("sched: DL replenish lagged to much\n"); + printk_deferred("sched: DL replenish lagged to much\n"); } dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0ebfd7a29472..5d7667b37c21 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -896,7 +896,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (!once) { once = true; - printk_sched("sched: RT throttling activated\n"); + printk_deferred("sched: RT throttling activated\n"); } } else { /* -- cgit v1.2.3 From c224815dac9c739b79050d3cc67443ff500bc478 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 4 Jun 2014 16:11:41 -0700 Subject: printk: Add printk_deferred_once Two of the three printk_deferred uses are really printk_once style uses, so add a printk_deferred_once macro to simplify those call sites. Signed-off-by: John Stultz Reviewed-by: Steven Rostedt Reviewed-by: Jan Kara Cc: Peter Zijlstra Cc: Jiri Bohac Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 11 +++++++++++ kernel/sched/deadline.c | 7 +------ kernel/sched/rt.c | 8 +------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 7847301e2837..f086d6c99dbc 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -266,9 +266,20 @@ extern asmlinkage void dump_stack(void) __cold; printk(fmt, ##__VA_ARGS__); \ } \ }) +#define printk_deferred_once(fmt, ...) \ +({ \ + static bool __print_once __read_mostly; \ + \ + if (!__print_once) { \ + __print_once = true; \ + printk_deferred(fmt, ##__VA_ARGS__); \ + } \ +}) #else #define printk_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) +#define printk_deferred_once(fmt, ...) \ + no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_once(fmt, ...) \ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d17e1c48a79d..e1574fca03b5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -348,12 +348,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. 
*/ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - static bool lag_once = false; - - if (!lag_once) { - lag_once = true; - printk_deferred("sched: DL replenish lagged to much\n"); - } + printk_deferred_once("sched: DL replenish lagged to much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5d7667b37c21..b3512f1afce9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -890,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { - static bool once = false; - rt_rq->rt_throttled = 1; - - if (!once) { - once = true; - printk_deferred("sched: RT throttling activated\n"); - } + printk_deferred_once("sched: RT throttling activated\n"); } else { /* * In case we did anyway, make it go away, -- cgit v1.2.3 From 6d9bcb621b0b0a20604cbdb298c4487e44dd0da2 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 4 Jun 2014 16:11:43 -0700 Subject: timekeeping: use printk_deferred when holding timekeeping seqlock Jiri Bohac pointed out that there are rare but potential deadlock possibilities when calling printk while holding the timekeeping seqlock. This is due to printk() triggering console sem wakeup, which can cause scheduling code to trigger hrtimers which may try to read the time. Specifically, as Jiri pointed out, that path is: printk vprintk_emit console_unlock up(&console_sem) __up wake_up_process try_to_wake_up ttwu_do_activate ttwu_activate activate_task enqueue_task enqueue_task_fair hrtick_update hrtick_start_fair hrtick_start_fair get_time ktime_get --> endless loop on read_seqcount_retry(&timekeeper_seq, ...) This patch tries to avoid this issue by using printk_deferred (previously named printk_sched) which should defer printing via a irq_work_queue. Signed-off-by: John Stultz Reported-by: Jiri Bohac Reviewed-by: Steven Rostedt Cc: Jan Kara Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 15 +++++++++------ kernel/time/timekeeping.c | 7 ++++--- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 419a52cecd20..5b0ac4de3822 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) time_status |= STA_PPSERROR; pps_errcnt++; pps_dec_freq_interval(); - pr_err("hardpps: PPSERROR: interval too long - %ld s\n", - freq_norm.sec); + printk_deferred(KERN_ERR + "hardpps: PPSERROR: interval too long - %ld s\n", + freq_norm.sec); return 0; } @@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { - pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); + printk_deferred(KERN_WARNING + "hardpps: PPSWANDER: change=%ld\n", delta); time_status |= STA_PPSWANDER; pps_stbcnt++; pps_dec_freq_interval(); @@ -844,8 +846,9 @@ static void hardpps_update_phase(long error) * the time offset is updated. 
*/ if (jitter > (pps_jitter << PPS_POPCORN)) { - pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", - jitter, (pps_jitter << PPS_POPCORN)); + printk_deferred(KERN_WARNING + "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { @@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - pr_err("hardpps: PPSJITTER: bad pulse\n"); + printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); return; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f7df8ea21707..32d8d6aaedb8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, struct timespec *delta) { if (!timespec_valid_strict(delta)) { - printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " - "sleep delta value!\n"); + printk_deferred(KERN_WARNING + "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); @@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) if (unlikely(tk->clock->maxadj && (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { - printk_once(KERN_WARNING + printk_deferred_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->clock->name, (long)tk->mult + adj, (long)tk->clock->mult + tk->clock->maxadj); -- cgit v1.2.3 From 84b5ec8a9df86f3dcaaaf912715db35e4852d1da Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 4 Jun 2014 16:11:45 -0700 Subject: printk: report dropping of messages from logbuf If the log ring buffer becomes full, we silently overwrite old messages with new data. console_unlock will detect this case and fast-forward the console_* pointers to skip over the corrupted data, but nothing will be reported to the user. This patch hijacks the first valid log message after detecting that we dropped messages and prefixes it with a note detailing how many messages were dropped. For long (~1000 char) messages, this will result in some truncation of the real message, but given that we're dropping things anyway, that doesn't seem to be the end of the world. 
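To make the prefixing concrete, here is a small userspace sketch of the pattern console_unlock() now uses (this is not kernel code; TEXTBUF_SIZE, the drop count and the message text are invented for illustration): write the "** %u printk messages dropped ** " note into the output buffer first, then append the real message into whatever space remains, accepting truncation of very long lines.

#include <stdio.h>

#define TEXTBUF_SIZE 64	/* stand-in for the console output buffer */

static void emit_with_drop_note(char *text, size_t size,
				unsigned int dropped, const char *msg)
{
	size_t len = 0;

	/* prefix the first valid message after the detected gap */
	if (dropped)
		len = snprintf(text, size, "** %u printk messages dropped ** ",
			       dropped);
	if (len >= size)
		len = size - 1;

	/* append the real message into the remaining space; long messages
	 * simply get truncated, which is accepted as a fair trade-off */
	snprintf(text + len, size - len, "%s", msg);
}

int main(void)
{
	char text[TEXTBUF_SIZE];

	emit_with_drop_note(text, sizeof(text), 37,
			    "a fairly long kernel message that will be cut short");
	puts(text);
	return 0;
}

Built with gcc and run, this prints the drop note followed by as much of the original message as still fits in the 64-byte buffer, which is the same trade-off the patch makes with the console text buffer.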
Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Kay Sievers Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 35d9db251903..923c5d4e4202 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2157,10 +2157,15 @@ again: } if (console_seq < log_first_seq) { + len = sprintf(text, "** %u printk messages dropped ** ", + (unsigned)(log_first_seq - console_seq)); + /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; console_prev = 0; + } else { + len = 0; } skip: if (console_seq == log_next_seq) @@ -2185,8 +2190,8 @@ skip: } level = msg->level; - len = msg_print_text(msg, console_prev, false, - text, sizeof(text)); + len += msg_print_text(msg, console_prev, false, + text + len, sizeof(text) - len); console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; -- cgit v1.2.3 From a8fe19ebfbfd90ec17c02284717238b02efb9580 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Jun 2014 16:11:46 -0700 Subject: kernel/printk: use symbolic defines for console loglevels ... instead of naked numbers. Stuff in sysrq.c used to set it to 8 which is supposed to mean above default level so set it to DEBUG instead as we're terminating/killing all tasks and we want to be verbose there. Also, correct the check in x86_64_start_kernel which should be >= as we're clearly issuing the string there for all debug levels, not only the magical 10. Signed-off-by: Borislav Petkov Acked-by: Kees Cook Acked-by: Randy Dunlap Cc: Joe Perches Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/head64.c | 2 +- arch/x86/platform/uv/uv_nmi.c | 2 +- drivers/nubus/nubus.c | 18 +++++++++--------- drivers/tty/sysrq.c | 8 ++++---- include/linux/printk.h | 15 +++++++++++++-- init/main.c | 4 ++-- kernel/debug/kdb/kdb_bt.c | 2 +- kernel/debug/kdb/kdb_io.c | 2 +- kernel/debug/kdb/kdb_main.c | 2 +- kernel/printk/printk.c | 13 +++---------- 10 files changed, 36 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 068054f4bf20..eda1a865641e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -172,7 +172,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - if (console_loglevel == 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) early_printk("Kernel alive\n"); clear_page(init_level4_pgt); diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index be27da60dc8f..c89c93320c12 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -85,7 +85,7 @@ static cpumask_var_t uv_nmi_cpu_mask; * Default is all stack dumps go to the console and buffer. * Lower level to send to log buffer only. 
*/ -static int uv_nmi_loglevel = 7; +static int uv_nmi_loglevel = CONSOLE_LOGLEVEL_DEFAULT; module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644); /* diff --git a/drivers/nubus/nubus.c b/drivers/nubus/nubus.c index 43926cd25ae8..5066a7ef7b6c 100644 --- a/drivers/nubus/nubus.c +++ b/drivers/nubus/nubus.c @@ -473,7 +473,7 @@ static struct nubus_dev* __init if (slot == 0 && (unsigned long)dir.base % 2) dir.base += 1; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_functional_resource: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -568,7 +568,7 @@ static int __init nubus_get_vidnames(struct nubus_board* board, printk(KERN_INFO " video modes supported:\n"); nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_vidnames: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -629,7 +629,7 @@ static int __init nubus_get_vendorinfo(struct nubus_board* board, printk(KERN_INFO " vendor info:\n"); nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_vendorinfo: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -654,7 +654,7 @@ static int __init nubus_get_board_resource(struct nubus_board* board, int slot, struct nubus_dirent ent; nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_board_resource: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -753,19 +753,19 @@ static void __init nubus_find_rom_dir(struct nubus_board* board) if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_INFO "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* This one takes us to where we want to go. */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); nubus_get_subdir(&ent, &dir); /* Resource ID 01, also an "Unknown Macintosh" */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* FIXME: the first one is *not* always the right one. We @@ -780,7 +780,7 @@ static void __init nubus_find_rom_dir(struct nubus_board* board) path to that address... */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* Bwahahahaha... 
*/ @@ -816,7 +816,7 @@ static struct nubus_board* __init nubus_add_board(int slot, int bytelanes) board->fblock = rp; /* Dump the format block for debugging purposes */ - if (console_loglevel >= 10) { + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) { int i; printk(KERN_DEBUG "Slot %X, format block at 0x%p\n", slot, rp); diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index ce396ecdf412..b767a64e49d9 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -88,7 +88,7 @@ static void sysrq_handle_loglevel(int key) int i; i = key - '0'; - console_loglevel = 7; + console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; printk("Loglevel set to %d\n", i); console_loglevel = i; } @@ -343,7 +343,7 @@ static void send_sig_all(int sig) static void sysrq_handle_term(int key) { send_sig_all(SIGTERM); - console_loglevel = 8; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; } static struct sysrq_key_op sysrq_term_op = { .handler = sysrq_handle_term, @@ -387,7 +387,7 @@ static struct sysrq_key_op sysrq_thaw_op = { static void sysrq_handle_kill(int key) { send_sig_all(SIGKILL); - console_loglevel = 8; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; } static struct sysrq_key_op sysrq_kill_op = { .handler = sysrq_handle_kill, @@ -520,7 +520,7 @@ void __handle_sysrq(int key, bool check_mask) * routing in the consumers of /proc/kmsg. */ orig_log_level = console_loglevel; - console_loglevel = 7; + console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; printk(KERN_INFO "SysRq : "); op_p = __sysrq_get_key_op(key); diff --git a/include/linux/printk.h b/include/linux/printk.h index 37f3a6589c1c..319ff7e53efb 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -30,6 +30,17 @@ static inline const char *printk_skip_level(const char *buffer) return buffer; } +/* printk's without a loglevel use this.. */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL + +/* We show everything that is MORE important than this.. 
*/ +#define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ +#define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ +#define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */ +#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */ +#define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ +#define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ + extern int console_printk[]; #define console_loglevel (console_printk[0]) @@ -39,13 +50,13 @@ extern int console_printk[]; static inline void console_silent(void) { - console_loglevel = 0; + console_loglevel = CONSOLE_LOGLEVEL_SILENT; } static inline void console_verbose(void) { if (console_loglevel) - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } struct va_format { diff --git a/init/main.c b/init/main.c index e08c0b2065a1..04fab8d74c89 100644 --- a/init/main.c +++ b/init/main.c @@ -203,13 +203,13 @@ EXPORT_SYMBOL(loops_per_jiffy); static int __init debug_kernel(char *str) { - console_loglevel = 10; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } static int __init quiet_kernel(char *str) { - console_loglevel = 4; + console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; } diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index b03e0e814e43..fe15fff5df53 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,7 +21,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; kdb_set_current_task(p); if (addr) { diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff4849262c..7c70812caea5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -710,7 +710,7 @@ kdb_printit: } if (logging) { saved_loglevel = console_loglevel; - console_loglevel = 0; + console_loglevel = CONSOLE_LOGLEVEL_SILENT; printk(KERN_INFO "%s", kdb_buffer); } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8a1e50..2f7c760305ca 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv) static void kdb_dumpregs(struct pt_regs *regs) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; show_regs(regs); kdb_trap_printk--; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 923c5d4e4202..ea2d5f6962ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -54,18 +54,11 @@ #include "console_cmdline.h" #include "braille.h" -/* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL - -/* We show everything that is MORE important than this.. 
*/ -#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ - int console_printk[4] = { - DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ - MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ - DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ + CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; /* Deferred messaged from sched code are marked by this special level */ -- cgit v1.2.3 From 6516a466193fe7f72644d65467fb9905139228c3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:02 -0700 Subject: kernel/compat.c: use sizeof() instead of sizeof Fix 4 checkpatch warnings WARNING: sizeof *tv should be sizeof(*tv) Signed-off-by: Fabian Frederick Cc: "H. Peter Anvin" Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index e40b0430b562..633394f442f8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; + return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; else return __compat_get_timeval(tv, utv); } @@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval); int compat_put_timeval(const struct timeval *tv, void __user *utv) { if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; + return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; else return __compat_put_timeval(tv, utv); } @@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval); int compat_get_timespec(struct timespec *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else return __compat_get_timespec(ts, uts); } @@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec); int compat_put_timespec(const struct timespec *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else return __compat_put_timespec(ts, uts); } -- cgit v1.2.3 From 4e52365f279564cef0ddd41db5237f0471381093 Mon Sep 17 00:00:00 2001 From: Matthew Dempsky Date: Fri, 6 Jun 2014 14:36:42 -0700 Subject: ptrace: fix fork event messages across pid namespaces When tracing a process in another pid namespace, it's important for fork event messages to contain the child's pid as seen from the tracer's pid namespace, not the parent's. Otherwise, the tracer won't be able to correlate the fork event with later SIGTRAP signals it receives from the child. We still risk a race condition if a ptracer from a different pid namespace attaches after we compute the pid_t value. However, sending a bogus fork event message in this unlikely scenario is still a vast improvement over the status quo where we always send bogus fork event messages to debuggers in a different pid namespace than the forking process. 
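To see why the translated pid matters in practice, here is a rough tracer-side sketch (userspace, error handling omitted, not part of the patch) that consumes the fork event the way a debugger would: the value read with PTRACE_GETEVENTMSG is used directly as a pid to wait on and detach from, which only works if the kernel reported it in the tracer's pid namespace.

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t child = fork();

	if (child == 0) {			/* tracee: stop, then fork once */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		if (fork() == 0)
			_exit(0);
		wait(NULL);
		_exit(0);
	}

	waitpid(child, &status, 0);		/* tracee stopped itself */
	ptrace(PTRACE_SETOPTIONS, child, NULL, (void *)PTRACE_O_TRACEFORK);
	ptrace(PTRACE_CONT, child, NULL, NULL);

	waitpid(child, &status, 0);		/* PTRACE_EVENT_FORK stop */
	if (status >> 8 == (SIGTRAP | (PTRACE_EVENT_FORK << 8))) {
		unsigned long msg = 0;

		ptrace(PTRACE_GETEVENTMSG, child, NULL, &msg);
		printf("new child as seen by the tracer: %lu\n", msg);

		/* use the reported value as a real pid: reap the stop of the
		 * auto-attached new child and let it run again */
		waitpid((pid_t)msg, &status, 0);
		ptrace(PTRACE_DETACH, (pid_t)msg, NULL, NULL);
	}
	ptrace(PTRACE_DETACH, child, NULL, NULL);
	return 0;
}

If the tracer lives in a different pid namespace than the forking process, the waitpid()/PTRACE_DETACH calls on msg are exactly what breaks when the event message carries a pid from the wrong namespace.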
Signed-off-by: Matthew Dempsky Acked-by: Oleg Nesterov Cc: Kees Cook Cc: Julien Tinnes Cc: Roland McGrath Cc: Jan Kratochvil Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 32 ++++++++++++++++++++++++++++++++ kernel/fork.c | 10 +++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 07d0df6bf768..077904c8b70d 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -5,6 +5,7 @@ #include /* For struct task_struct. */ #include /* for IS_ERR_VALUE */ #include /* For BUG_ON. */ +#include /* For task_active_pid_ns. */ #include /* @@ -128,6 +129,37 @@ static inline void ptrace_event(int event, unsigned long message) } } +/** + * ptrace_event_pid - possibly stop for a ptrace event notification + * @event: %PTRACE_EVENT_* value to report + * @pid: process identifier for %PTRACE_GETEVENTMSG to return + * + * Check whether @event is enabled and, if so, report @event and @pid + * to the ptrace parent. @pid is reported as the pid_t seen from the + * the ptrace parent's pid namespace. + * + * Called without locks. + */ +static inline void ptrace_event_pid(int event, struct pid *pid) +{ + /* + * FIXME: There's a potential race if a ptracer in a different pid + * namespace than parent attaches between computing message below and + * when we acquire tasklist_lock in ptrace_stop(). If this happens, + * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. + */ + unsigned long message = 0; + struct pid_namespace *ns; + + rcu_read_lock(); + ns = task_active_pid_ns(rcu_dereference(current->parent)); + if (ns) + message = pid_nr_ns(pid, ns); + rcu_read_unlock(); + + ptrace_event(event, message); +} + /** * ptrace_init_task - initialize ptrace state for a new child * @child: new child task diff --git a/kernel/fork.c b/kernel/fork.c index 0d53eb0dfb6f..d2799d1fc952 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1606,10 +1606,12 @@ long do_fork(unsigned long clone_flags, */ if (!IS_ERR(p)) { struct completion vfork; + struct pid *pid; trace_sched_process_fork(current, p); - nr = task_pid_vnr(p); + pid = get_task_pid(p, PIDTYPE_PID); + nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); @@ -1624,12 +1626,14 @@ long do_fork(unsigned long clone_flags, /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) - ptrace_event(trace, nr); + ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); + ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } + + put_pid(pid); } else { nr = PTR_ERR(p); } -- cgit v1.2.3 From 650226bd95817d727c9a388752d72ef292c4a668 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:44 -0700 Subject: ptrace: task_clear_jobctl_trapping()->wake_up_bit() needs mb() __wake_up_bit() checks waitqueue_active() and thus the caller needs mb() as wake_up_bit() documents, fix task_clear_jobctl_trapping(). 
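The ordering rule the added smp_mb() restores is easiest to see in an open-coded waiter/waker pair. The sketch below is kernel-style C with made-up names (my_flags, my_wq); it is not code from the patch, and wake_up_bit() simply hides the same waitqueue_active() fast path that is spelled out explicitly here.

#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait.h>

static unsigned long my_flags;			/* bit 0 means "busy" */
static DECLARE_WAIT_QUEUE_HEAD(my_wq);

static void my_wait_for_idle(void)		/* waiter */
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&my_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (!test_bit(0, &my_flags))
			break;
		schedule();
	}
	finish_wait(&my_wq, &wait);
}

static void my_clear_busy(void)			/* waker */
{
	clear_bit(0, &my_flags);
	/*
	 * Without this barrier the waitqueue_active() load below (done
	 * inside wake_up_bit() in the real code) can be satisfied before
	 * the clear_bit() store is visible; a waiter that has just queued
	 * itself then re-checks the bit, still sees it set and sleeps with
	 * nobody left to wake it.
	 */
	smp_mb();
	if (waitqueue_active(&my_wq))
		wake_up(&my_wq);
}

task_clear_jobctl_trapping() is the waker side of exactly this pattern, with JOBCTL_TRAPPING_BIT as the flag and the bit waitqueue supplied by wake_up_bit().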
Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6e600aaa2af4..9b5453ee5f5e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -277,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; + smp_mb(); /* advised by wake_up_bit() */ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } -- cgit v1.2.3 From 6114041aa7e79a28cee554450fc4a67442327c7f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:46 -0700 Subject: signals: s/siginitset/sigemptyset/ in do_sigtimedwait() Cosmetic, but siginitset(0) looks a bit strange, sigemptyset() is what do_sigtimedwait() needs. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9b5453ee5f5e..251ca9c9d666 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2855,7 +2855,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); - siginitset(&tsk->real_blocked, 0); + sigemptyset(&tsk->real_blocked); sig = dequeue_signal(tsk, &mask, info); } spin_unlock_irq(&tsk->sighand->siglock); -- cgit v1.2.3 From 9490592f2762320ac220f50d4459d9ea7bb797c8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:48 -0700 Subject: signals: kill rm_from_queue(), change prepare_signal() to use for_each_thread() rm_from_queue() doesn't make sense. The only caller, prepare_signal(), can use rm_from_queue_full() with the same effect. While at it, change prepare_signal() to use for_each_thread() instead of do/while_each_thread. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 43 ++++++++++--------------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 251ca9c9d666..19316dc3eb0b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -728,29 +728,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) } return 1; } -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. 
- */ -static int rm_from_queue(unsigned long mask, struct sigpending *s) -{ - struct sigqueue *q, *n; - - if (!sigtestsetmask(&s->signal, mask)) - return 0; - - sigdelsetmask(&s->signal, mask); - list_for_each_entry_safe(q, n, &s->list, list) { - if (q->info.si_signo < SIGRTMIN && - (mask & sigmask(q->info.si_signo))) { - list_del_init(&q->list); - __sigqueue_free(q); - } - } - return 1; -} static inline int is_si_special(const struct siginfo *info) { @@ -862,6 +839,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; + sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { if (signal->flags & SIGNAL_GROUP_COREDUMP) @@ -873,26 +851,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) /* * This is a stop signal. Remove SIGCONT from all queues. */ - rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); - t = p; - do { - rm_from_queue(sigmask(SIGCONT), &t->pending); - } while_each_thread(p, t); + siginitset(&flush, sigmask(SIGCONT)); + rm_from_queue_full(&flush, &signal->shared_pending); + for_each_thread(p, t) + rm_from_queue_full(&flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. */ - rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); - t = p; - do { + siginitset(&flush, SIG_KERNEL_STOP_MASK); + rm_from_queue_full(&flush, &signal->shared_pending); + for_each_thread(p, t) { + rm_from_queue_full(&flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); if (likely(!(t->ptrace & PT_SEIZED))) wake_up_state(t, __TASK_STOPPED); else ptrace_trap_notify(t); - } while_each_thread(p, t); + } /* * Notify the parent with CLD_CONTINUED if we were stopped. -- cgit v1.2.3 From c09c144139f227de13330111adcafa1659ebd008 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:50 -0700 Subject: signals: rename rm_from_queue_full() to flush_sigqueue_mask() "rm_from_queue_full" looks ugly and misleading, especially now that rm_from_queue() has gone away. Rename it to flush_sigqueue_mask(), this matches flush_sigqueue() we already have. Also remove the obsolete comment which explains the difference with rm_from_queue() we already killed. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 19316dc3eb0b..c088011f873c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -706,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) * Returns 1 if any signals were found. * * All callers must be holding the siglock. - * - * This version takes a sigset mask and looks at all signals, - * not just those in the first mask word. */ -static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) +static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; @@ -852,18 +849,18 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * This is a stop signal. Remove SIGCONT from all queues. 
*/ siginitset(&flush, sigmask(SIGCONT)); - rm_from_queue_full(&flush, &signal->shared_pending); + flush_sigqueue_mask(&flush, &signal->shared_pending); for_each_thread(p, t) - rm_from_queue_full(&flush, &t->pending); + flush_sigqueue_mask(&flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. */ siginitset(&flush, SIG_KERNEL_STOP_MASK); - rm_from_queue_full(&flush, &signal->shared_pending); + flush_sigqueue_mask(&flush, &signal->shared_pending); for_each_thread(p, t) { - rm_from_queue_full(&flush, &t->pending); + flush_sigqueue_mask(&flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); if (likely(!(t->ptrace & PT_SEIZED))) wake_up_state(t, __TASK_STOPPED); @@ -3102,9 +3099,9 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (sig_handler_ignored(sig_handler(t, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); - rm_from_queue_full(&mask, &t->signal->shared_pending); + flush_sigqueue_mask(&mask, &t->signal->shared_pending); do { - rm_from_queue_full(&mask, &t->pending); + flush_sigqueue_mask(&mask, &t->pending); } while_each_thread(current, t); } } @@ -3113,7 +3110,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) return 0; } -static int +static int do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) { stack_t oss; -- cgit v1.2.3 From afe2b0386ac22b6a189a2067b25282cade3fbb4d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:51 -0700 Subject: signals: cleanup the usage of t/current in do_sigaction() The usage of "task_struct *t" and "current" in do_sigaction() looks really annoying and chaotic. Initially "t" is used as a cached value of current but not consistently, then it is reused as a loop variable and we have to use "current" again. Clean up this mess and also convert the code to use for_each_thread(). 
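For readers who have not seen the two idioms side by side, the only difference is how the loop is primed and terminated; a minimal kernel-style sketch (helper names invented, locking elided) looks like this.

#include <linux/sched.h>

/* old idiom: seed t with the group leader, loop with while_each_thread() */
static void old_iterate(struct task_struct *p)
{
	struct task_struct *t = p;

	do {
		/* ... per-thread work, under tasklist_lock or siglock ... */
	} while_each_thread(p, t);
}

/* new idiom: for_each_thread() needs no seeding and reads as a plain loop */
static void new_iterate(struct task_struct *p)
{
	struct task_struct *t;

	for_each_thread(p, t) {
		/* ... per-thread work, under the same protection ... */
	}
}

The conversion here is a readability cleanup; both forms must still run under the appropriate locking, which prepare_signal() and do_sigaction() already hold.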
Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c088011f873c..a6d8c3af0ad6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3068,16 +3068,16 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { - struct task_struct *t = current; + struct task_struct *p = current, *t; struct k_sigaction *k; sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; - k = &t->sighand->action[sig-1]; + k = &p->sighand->action[sig-1]; - spin_lock_irq(¤t->sighand->siglock); + spin_lock_irq(&p->sighand->siglock); if (oact) *oact = *k; @@ -3096,17 +3096,16 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ - if (sig_handler_ignored(sig_handler(t, sig), sig)) { + if (sig_handler_ignored(sig_handler(p, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); - flush_sigqueue_mask(&mask, &t->signal->shared_pending); - do { + flush_sigqueue_mask(&mask, &p->signal->shared_pending); + for_each_thread(p, t) flush_sigqueue_mask(&mask, &t->pending); - } while_each_thread(current, t); } } - spin_unlock_irq(¤t->sighand->siglock); + spin_unlock_irq(&p->sighand->siglock); return 0; } -- cgit v1.2.3 From 0341729b4b832e753c5e745c6ba0e797f6198be0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:53 -0700 Subject: signals: mv {dis,}allow_signal() from sched.h/exit.c to signal.[ch] Move the declaration/definition of allow_signal/disallow_signal to signal.h/signal.c. The new place is more logical and allows to use the static helpers in signal.c (see the next changes). While at it, make them return void and remove the valid_signal() check. Nobody checks the returned value, and in-kernel users must not pass the wrong signal number. 
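A typical caller now looks like the hypothetical kernel thread below (the function name, the choice of SIGTERM and the one-second poll are invented for illustration); with the void return type there is no longer a value to check, which matches how callers already used it.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/signal.h>

static int my_worker_fn(void *data)
{
	allow_signal(SIGTERM);		/* deliver SIGTERM instead of dropping it */

	while (!kthread_should_stop()) {
		if (signal_pending(current)) {
			/* someone sent us SIGTERM: acknowledge it and wind down */
			flush_signals(current);
			break;
		}
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

disallow_signal() is the inverse: the kthread tells the signal code to go back to ignoring the signal once it no longer wants the wakeup.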
Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 --- include/linux/signal.h | 2 ++ kernel/exit.c | 39 --------------------------------------- kernel/signal.c | 29 +++++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8fcd0e6098d9..ea74596014a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2414,9 +2414,6 @@ extern void flush_itimer_signals(void); extern void do_group_exit(int); -extern int allow_signal(int); -extern int disallow_signal(int); - extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); diff --git a/include/linux/signal.h b/include/linux/signal.h index ae744c314630..ac83c593f4b9 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -284,6 +284,8 @@ extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int stepping); extern void exit_signals(struct task_struct *tsk); +extern void allow_signal(int); +extern void disallow_signal(int); /* * Eventually that'll replace get_signal_to_deliver(); macro for now, diff --git a/kernel/exit.c b/kernel/exit.c index 750c2e594617..e5c4668f1799 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -313,45 +313,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -/* - * Let kernel threads use this to say that they allow a certain signal. - * Must not be used if kthread was cloned with CLONE_SIGHAND. - */ -int allow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - /* This is only needed for daemonize()'ed kthreads */ - sigdelset(¤t->blocked, sig); - /* - * Kernel threads handle their own signals. Let the signal code - * know it'll be handled, so that they don't get converted to - * SIGKILL or just silently dropped. - */ - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(allow_signal); - -int disallow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(disallow_signal); - #ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. diff --git a/kernel/signal.c b/kernel/signal.c index a6d8c3af0ad6..7d6ff8b18509 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3066,6 +3066,35 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, } #endif +/* + * Let kernel threads use this to say that they allow a certain signal. + * Must not be used if kthread was cloned with CLONE_SIGHAND. + */ +void allow_signal(int sig) +{ + spin_lock_irq(¤t->sighand->siglock); + /* This is only needed for daemonize()'ed kthreads */ + sigdelset(¤t->blocked, sig); + /* + * Kernel threads handle their own signals. 
Let the signal code + * know it'll be handled, so that they don't get converted to + * SIGKILL or just silently dropped. + */ + current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); +} +EXPORT_SYMBOL(allow_signal); + +void disallow_signal(int sig) +{ + spin_lock_irq(&current->sighand->siglock); + current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); +} +EXPORT_SYMBOL(disallow_signal); + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; -- cgit v1.2.3 From ec5955b8fdc1b0bc62495e13869769be732cc6c3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:57 -0700 Subject: signals: kill the obsolete sigdelset() and recalc_sigpending() in allow_signal() allow_signal() does sigdelset(current->blocked) for historical reasons; previously it could be called by a daemonize()'ed kthread, and daemonize() played with current->blocked. Now that daemonize() has gone away we can remove sigdelset() and recalc_sigpending(). If a user really wants to unblock a signal, it must use sigprocmask() or set_current_blocked() explicitly. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7d6ff8b18509..c64c891ca0a6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3072,16 +3072,13 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, */ void allow_signal(int sig) { - spin_lock_irq(&current->sighand->siglock); - /* This is only needed for daemonize()'ed kthreads */ - sigdelset(&current->blocked, sig); /* * Kernel threads handle their own signals. Let the signal code * know it'll be handled, so that they don't get converted to * SIGKILL or just silently dropped. */ + spin_lock_irq(&current->sighand->siglock); current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); } EXPORT_SYMBOL(allow_signal); -- cgit v1.2.3 From 580d34e42ad621736a3c53c9c11a2152c18a4068 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:58 -0700 Subject: signals: disallow_signal() should flush the potentially pending signal disallow_signal() simply sets SIG_IGN; this is not enough, and recalc_sigpending() is simply pointless because it can never change the state of TIF_SIGPENDING. If we ignore a signal, we also need to do flush_sigqueue_mask() for the case when this signal is pending, this way recalc_sigpending() can actually clear TIF_SIGPENDING and we do not "leak" the allocated siginfo's. 
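The kernel-side flush brings disallow_signal() in line with the userspace-visible rule that do_sigaction() already implements (quoted earlier in this series: setting a disposition to SIG_IGN discards a pending instance of that signal). The small userspace program below only demonstrates that POSIX rule; it is an analogy for the flush_sigqueue_mask() call, not a test of disallow_signal() itself.

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t blocked, pending;

	sigemptyset(&blocked);
	sigaddset(&blocked, SIGUSR1);
	sigprocmask(SIG_BLOCK, &blocked, NULL);	/* block SIGUSR1 ... */
	raise(SIGUSR1);				/* ... and make it pending */

	sigpending(&pending);
	printf("pending before SIG_IGN: %d\n", sigismember(&pending, SIGUSR1));

	signal(SIGUSR1, SIG_IGN);		/* the kernel drops the pending signal */

	sigpending(&pending);
	printf("pending after  SIG_IGN: %d\n", sigismember(&pending, SIGUSR1));
	return 0;
}

It prints 1 and then 0: the queued SIGUSR1 disappears the moment its disposition becomes SIG_IGN, which is exactly the bookkeeping (and siginfo memory) that disallow_signal() was leaving behind for kernel threads.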
Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c64c891ca0a6..3ec405132c79 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3085,8 +3085,15 @@ EXPORT_SYMBOL(allow_signal); void disallow_signal(int sig) { + sigset_t mask; + + sigemptyset(&mask); + sigaddset(&mask, sig); + spin_lock_irq(¤t->sighand->siglock); current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; + flush_sigqueue_mask(&mask, ¤t->signal->shared_pending); + flush_sigqueue_mask(&mask, ¤t->pending); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } -- cgit v1.2.3 From b4e74264eb0b03f42097fa70a0766312156244a0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:37:00 -0700 Subject: signals: introduce kernel_sigaction() Now that allow_signal() is really trivial we can unify it with disallow_signal(). Add the new helper, kernel_sigaction(), and reimplement allow_signal/disallow_signal as a trivial wrappers. This saves one EXPORT_SYMBOL() and the new helper can have more users. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 18 ++++++++++++++++-- kernel/signal.c | 36 ++++++++++++------------------------ 2 files changed, 28 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index ac83c593f4b9..c9e65360c49a 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -284,8 +284,22 @@ extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int stepping); extern void exit_signals(struct task_struct *tsk); -extern void allow_signal(int); -extern void disallow_signal(int); +extern void kernel_sigaction(int, __sighandler_t); + +static inline void allow_signal(int sig) +{ + /* + * Kernel threads handle their own signals. Let the signal code + * know it'll be handled, so that they don't get converted to + * SIGKILL or just silently dropped. + */ + kernel_sigaction(sig, (__force __sighandler_t)2); +} + +static inline void disallow_signal(int sig) +{ + kernel_sigaction(sig, SIG_IGN); +} /* * Eventually that'll replace get_signal_to_deliver(); macro for now, diff --git a/kernel/signal.c b/kernel/signal.c index 3ec405132c79..a4077e90f19f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3067,37 +3067,25 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, #endif /* - * Let kernel threads use this to say that they allow a certain signal. - * Must not be used if kthread was cloned with CLONE_SIGHAND. + * For kthreads only, must not be used if cloned with CLONE_SIGHAND */ -void allow_signal(int sig) +void kernel_sigaction(int sig, __sighandler_t action) { - /* - * Kernel threads handle their own signals. Let the signal code - * know it'll be handled, so that they don't get converted to - * SIGKILL or just silently dropped. 
- */ spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - spin_unlock_irq(¤t->sighand->siglock); -} -EXPORT_SYMBOL(allow_signal); + current->sighand->action[sig - 1].sa.sa_handler = action; + if (action == SIG_IGN) { + sigset_t mask; -void disallow_signal(int sig) -{ - sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); - sigemptyset(&mask); - sigaddset(&mask, sig); - - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - flush_sigqueue_mask(&mask, ¤t->signal->shared_pending); - flush_sigqueue_mask(&mask, ¤t->pending); - recalc_sigpending(); + flush_sigqueue_mask(&mask, ¤t->signal->shared_pending); + flush_sigqueue_mask(&mask, ¤t->pending); + recalc_sigpending(); + } spin_unlock_irq(¤t->sighand->siglock); } -EXPORT_SYMBOL(disallow_signal); +EXPORT_SYMBOL(kernel_sigaction); int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { -- cgit v1.2.3 From 76e0a6f40b198f08fca69221c3d0112d1e9713a7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:37:02 -0700 Subject: signals: change wait_for_helper() to use kernel_sigaction() Now that we have kernel_sigaction() we can change wait_for_helper() to use it and cleans up the code a bit. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 0ac67a5861c5..8637e041a247 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -285,10 +285,7 @@ static int wait_for_helper(void *data) pid_t pid; /* If SIGCLD is ignored sys_wait4 won't populate the status. */ - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; - spin_unlock_irq(¤t->sighand->siglock); - + kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); if (pid < 0) { sub_info->retval = pid; -- cgit v1.2.3 From a219ccf4637396a2392bfbec7c12acbfe2b06b46 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 6 Jun 2014 14:37:05 -0700 Subject: smp: print more useful debug info upon receiving IPI on an offline CPU There is a longstanding problem related to CPU hotplug which causes IPIs to be delivered to offline CPUs, and the smp-call-function IPI handler code prints out a warning whenever this is detected. Every once in a while this (usually harmless) warning gets reported on LKML, but so far it has not been completely fixed. Usually the solution involves finding out the IPI sender and fixing it by adding appropriate synchronization with CPU hotplug. However, while going through one such internal bug reports, I found that there is a significant bug in the receiver side itself (more specifically, in stop-machine) that can lead to this problem even when the sender code is perfectly fine. This patchset fixes that synchronization problem in the CPU hotplug stop-machine code. Patch 1 adds some additional debug code to the smp-call-function framework, to help debug such issues easily. Patch 2 modifies the stop-machine code to ensure that any IPIs that were sent while the target CPU was online, would be noticed and handled by that CPU without fail before it goes offline. 
Thus, this avoids scenarios where IPIs are received on offline CPUs (as long as the sender uses proper hotplug synchronization). In fact, I debugged the problem by using Patch 1, and found that the payload of the IPI was always the block layer's trigger_softirq() function. But I was not able to find anything wrong with the block layer code. That's when I started looking at the stop-machine code and realized that there is a race-window which makes the IPI _receiver_ the culprit, not the sender. Patch 2 fixes that race and hence this should put an end to most of the hard-to-debug IPI-to-offline-CPU issues. This patch (of 2): Today the smp-call-function code just prints a warning if we get an IPI on an offline CPU. This info is sufficient to let us know that something went wrong, but often it is very hard to debug exactly who sent the IPI and why, from this info alone. In most cases, we get the warning about the IPI to an offline CPU, immediately after the CPU going offline comes out of the stop-machine phase and reenables interrupts. Since all online CPUs participate in stop-machine, the information regarding the sender of the IPI is already lost by the time we exit the stop-machine loop. So even if we dump the stack on each CPU at this point, we won't find anything useful since all of them will show the stack-trace of the stopper thread. So we need a better way to figure out who sent the IPI and why. To achieve this, when we detect an IPI targeted to an offline CPU, loop through the call-single-data linked list and print out the payload (i.e., the name of the function which was supposed to be executed by the target CPU). This would give us an insight as to who might have sent the IPI and help us debug this further. [akpm@linux-foundation.org: correctly suppress warning output on second and later occurrences] Signed-off-by: Srivatsa S. Bhat Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Tejun Heo Cc: Rusty Russell Cc: Frederic Weisbecker Cc: Christoph Hellwig Cc: Mel Gorman Cc: Rik van Riel Cc: Borislav Petkov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Gautham R Shenoy Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 06d574e42c72..306f8180b0d5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -185,14 +185,26 @@ void generic_smp_call_function_single_interrupt(void) { struct llist_node *entry; struct call_single_data *csd, *csd_next; + static bool warned; + + entry = llist_del_all(&__get_cpu_var(call_single_queue)); + entry = llist_reverse_order(entry); /* * Shouldn't receive this interrupt on a cpu that is not yet online. */ - WARN_ON_ONCE(!cpu_online(smp_processor_id())); + if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { + warned = true; + WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); - entry = llist_del_all(&__get_cpu_var(call_single_queue)); - entry = llist_reverse_order(entry); + /* + * We don't have to use the _safe() variant here + * because we are not invoking the IPI handlers yet. 
+ */ + llist_for_each_entry(csd, entry, llist) + pr_warn("IPI callback %pS sent to offline CPU\n", + csd->func); + } llist_for_each_entry_safe(csd, csd_next, entry, llist) { csd->func(csd->info); -- cgit v1.2.3 From f06e5153f4ae2e2f3b0300f0e260e40cb7fefd45 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Jun 2014 14:37:07 -0700 Subject: kernel/panic.c: add "crash_kexec_post_notifiers" option for kdump after panic_notifers Add a "crash_kexec_post_notifiers" boot option to run kdump after running panic_notifiers and dump kmsg. This can help rare situations where kdump fails because of unstable crashed kernel or hardware failure (memory corruption on critical data/code), or the 2nd kernel is already broken by the 1st kernel (it's a broken behavior, but who can guarantee that the "crashed" kernel works correctly?). Usage: add "crash_kexec_post_notifiers" to kernel boot option. Note that this actually increases risks of the failure of kdump. This option should be set only if you worry about the rare case of kdump failure rather than increasing the chance of success. Signed-off-by: Masami Hiramatsu Acked-by: Motohiro Kosaki Acked-by: Vivek Goyal Cc: Eric Biederman Cc: Yoshihiro YUNOMAE Cc: Satoru MORIYA Cc: Tomoki Sekiyama Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 8 ++++++++ kernel/panic.c | 23 +++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9973a7e2e0ac..b9f67781c577 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2361,6 +2361,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. timeout < 0: reboot immediately Format: + crash_kexec_post_notifiers + Run kdump after running panic-notifiers and dumping + kmsg. This only for the users who doubt kdump always + succeeds in any situation. + Note that this also increases risks of kdump failure, + because some panic notifiers can make the crashed + kernel more unstable. + parkbd.port= [HW] Parallel port number the keyboard adapter is connected to, default is 0. Format: diff --git a/kernel/panic.c b/kernel/panic.c index d02fa9fef46a..62e16cef9cc2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); +static bool crash_kexec_post_notifiers; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -112,9 +113,11 @@ void panic(const char *fmt, ...) /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. - * Do we want to call this before we try to display a message? + * If we want to run this after calling panic_notifiers, pass + * the "crash_kexec_post_notifiers" option to the kernel. */ - crash_kexec(NULL); + if (!crash_kexec_post_notifiers) + crash_kexec(NULL); /* * Note smp_send_stop is the usual smp shutdown function, which @@ -131,6 +134,15 @@ void panic(const char *fmt, ...) kmsg_dump(KMSG_DUMP_PANIC); + /* + * If you doubt kdump always works fine in any situation, + * "crash_kexec_post_notifiers" offers you a chance to run + * panic_notifiers and dumping kmsg before kdump. + * Note: since some panic_notifiers can make crashed kernel + * more unstable, it can increase risks of the kdump failure too. 
+ */ + crash_kexec(NULL); + bust_spinlocks(0); if (!panic_blink) @@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); +static int __init setup_crash_kexec_post_notifiers(char *s) +{ + crash_kexec_post_notifiers = true; + return 0; +} +early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers); + static int __init oops_setup(char *s) { if (!s) -- cgit v1.2.3 From e1bebcf41ed0aa15f11cec186cbd5141730bcafc Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:37:09 -0700 Subject: kernel/kexec.c: convert printk to pr_foo() + some pr_warning -> pr_warn and checkpatch warning fixes Signed-off-by: Fabian Frederick Cc: Eric Biederman Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 69 +++++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 28c57069ef68..6748688813d0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -125,8 +125,8 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned long dest); static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) + unsigned long nr_segments, + struct kexec_segment __user *segments) { size_t segment_bytes; struct kimage *image; @@ -257,13 +257,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); + pr_err("Could not allocate control_code_buffer\n"); goto out_free; } image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { - printk(KERN_ERR "Could not allocate swap buffer\n"); + pr_err("Could not allocate swap buffer\n"); goto out_free; } @@ -332,7 +332,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); + pr_err("Could not allocate control_code_buffer\n"); goto out_free; } @@ -621,8 +621,8 @@ static void kimage_terminate(struct kimage *image) #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION)? \ - phys_to_virt((entry & PAGE_MASK)): ptr +1) + ptr = (entry & IND_INDIRECTION) ? \ + phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { @@ -650,8 +650,7 @@ static void kimage_free(struct kimage *image) * done with it. */ ind = entry; - } - else if (entry & IND_SOURCE) + } else if (entry & IND_SOURCE) kimage_free_entry(entry); } /* Free the final indirection page */ @@ -774,8 +773,7 @@ static struct page *kimage_alloc_page(struct kimage *image, addr = old_addr; page = old_page; break; - } - else { + } else { /* Place the page on the destination list I * will use it later. 
*/ @@ -1059,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return -EINVAL; ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); - for (i=0; i < nr_segments; i++) { + for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); if (result) return -EFAULT; @@ -1214,14 +1212,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) * squirrelled away. ELF notes happen to provide * all of that, so there is no need to invent something new. */ - buf = (u32*)per_cpu_ptr(crash_notes, cpu); + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); if (!buf) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); + &prstatus, sizeof(prstatus)); final_note(buf); } @@ -1230,8 +1228,7 @@ static int __init crash_notes_memory_init(void) /* Allocate memory for saving cpu registers. */ crash_notes = alloc_percpu(note_buf_t); if (!crash_notes) { - printk("Kexec: Memory allocation for saving cpu register" - " states failed\n"); + pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; @@ -1253,10 +1250,10 @@ subsys_initcall(crash_notes_memory_init); * * The function returns 0 on success and -EINVAL on failure. */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) { char *cur = cmdline, *tmp; @@ -1267,12 +1264,12 @@ static int __init parse_crashkernel_mem(char *cmdline, /* get the start of the range */ start = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("crashkernel: Memory value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; if (*cur != '-') { - pr_warning("crashkernel: '-' expected\n"); + pr_warn("crashkernel: '-' expected\n"); return -EINVAL; } cur++; @@ -1281,31 +1278,30 @@ static int __init parse_crashkernel_mem(char *cmdline, if (*cur != ':') { end = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("crashkernel: Memory " - "value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; if (end <= start) { - pr_warning("crashkernel: end <= start\n"); + pr_warn("crashkernel: end <= start\n"); return -EINVAL; } } if (*cur != ':') { - pr_warning("crashkernel: ':' expected\n"); + pr_warn("crashkernel: ':' expected\n"); return -EINVAL; } cur++; size = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("Memory value expected\n"); + pr_warn("Memory value expected\n"); return -EINVAL; } cur = tmp; if (size >= system_ram) { - pr_warning("crashkernel: invalid size\n"); + pr_warn("crashkernel: invalid size\n"); return -EINVAL; } @@ -1323,8 +1319,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("Memory value expected " - "after '@'\n"); + pr_warn("Memory value expected after '@'\n"); return -EINVAL; } } @@ -1336,26 +1331,26 @@ static int __init parse_crashkernel_mem(char *cmdline, /* * That function parses "simple" (old) crashkernel command lines like * - * crashkernel=size[@offset] + * crashkernel=size[@offset] * * It returns 0 on success and -EINVAL on failure. 
*/ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) { char *cur = cmdline; *crash_size = memparse(cmdline, &cur); if (cmdline == cur) { - pr_warning("crashkernel: memory value expected\n"); + pr_warn("crashkernel: memory value expected\n"); return -EINVAL; } if (*cur == '@') *crash_base = memparse(cur+1, &cur); else if (*cur != ' ' && *cur != '\0') { - pr_warning("crashkernel: unrecognized char\n"); + pr_warn("crashkernel: unrecognized char\n"); return -EINVAL; } @@ -1691,7 +1686,7 @@ int kernel_kexec(void) * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); - printk(KERN_EMERG "Starting new kernel\n"); + pr_emerg("Starting new kernel\n"); machine_shutdown(); } -- cgit v1.2.3 From f88083005ab319abba5d0b2e4e997558245493c8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:17 -0700 Subject: sysctl: clean up char buffer arguments When writing to a sysctl string, each write, regardless of VFS position, began writing the string from the start. This meant the contents of the last write to the sysctl controlled the string contents instead of the first. This misbehavior was featured in an exploit against Chrome OS. While it's not in itself a vulnerability, it's a weirdness that isn't on the mind of most auditors: "This filter looks correct, the first line written would not be meaningful to sysctl" doesn't apply here, since the size of the write and the contents of the final write are what matter when writing to sysctls. This adds the sysctl kernel.sysctl_writes_strict to control the write behavior. The default (0) reports when VFS position is non-0 on a write, but retains legacy behavior, -1 disables the warning, and 1 enables the position-respecting behavior. The long-term plan here is to wait for userspace to be fixed in response to the new warning and to then switch the default kernel behavior to the new position-respecting behavior. This patch (of 4): The char buffer arguments are needlessly cast in weird places. Clean it up so things are easier to read. 
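For concreteness, the overwrite-versus-append behavior described above can be reproduced from userspace with two back-to-back write() calls on a single sysctl file descriptor. The sketch below is an editor-added illustration and is not part of this patch series; the choice of /proc/sys/kernel/domainname as the target string sysctl is only an example (any writable string sysctl would do), and running it requires privileges and alters system state. Under the legacy behavior the second, shorter write ends up as the final value; with kernel.sysctl_writes_strict=1 the second write appends at its file position instead.

/* Illustrative only: shows where consecutive writes on one sysctl fd land. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	ssize_t n;
	int fd = open("/proc/sys/kernel/domainname", O_RDWR); /* example sysctl */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* First write: the file position advances past the value just written. */
	if (write(fd, "first-write-value", 17) < 0)
		perror("write 1");
	/* Second write: position is now non-zero; legacy mode restarts at 0. */
	if (write(fd, "second", 6) < 0)
		perror("write 2");

	lseek(fd, 0, SEEK_SET);
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("resulting value: %s", buf);
	}
	close(fd);
	return 0;
}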
Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 40ce2d983b12..3e214beabbe9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1703,8 +1703,8 @@ int __init sysctl_init(void) #ifdef CONFIG_PROC_SYSCTL -static int _proc_do_string(void* data, int maxlen, int write, - void __user *buffer, +static int _proc_do_string(char *data, int maxlen, int write, + char __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -1730,7 +1730,7 @@ static int _proc_do_string(void* data, int maxlen, int write, len = maxlen-1; if(copy_from_user(data, buffer, len)) return -EFAULT; - ((char *) data)[len] = 0; + data[len] = 0; *ppos += *lenp; } else { len = strlen(data); @@ -1748,10 +1748,10 @@ static int _proc_do_string(void* data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, data, len)) + if (copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { - if(put_user('\n', ((char __user *) buffer) + len)) + if (put_user('\n', buffer + len)) return -EFAULT; len++; } @@ -1781,8 +1781,8 @@ static int _proc_do_string(void* data, int maxlen, int write, int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, - buffer, lenp, ppos); + return _proc_do_string((char *)(table->data), table->maxlen, write, + (char __user *)buffer, lenp, ppos); } static size_t proc_skip_spaces(char **buf) -- cgit v1.2.3 From 2ca9bb456ada8bcbdc8f77f8fc78207653bbaa92 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:18 -0700 Subject: sysctl: refactor sysctl string writing logic Consolidate buffer length checking with new-line/end-of-line checking. Additionally, instead of reading user memory twice, just do the assignment during the loop. This change doesn't affect the potential races here. It was already possible to read a sysctl that was in the middle of a write. In both cases, the string will always be NULL terminated. The pre-existing race remains a problem to be solved. Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e214beabbe9..ac6847feaa83 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1717,21 +1717,18 @@ static int _proc_do_string(char *data, int maxlen, int write, } if (write) { + /* Start writing from beginning of buffer. */ len = 0; + *ppos += *lenp; p = buffer; - while (len < *lenp) { + while ((p - buffer) < *lenp && len < maxlen - 1) { if (get_user(c, p++)) return -EFAULT; if (c == 0 || c == '\n') break; - len++; + data[len++] = c; } - if (len >= maxlen) - len = maxlen-1; - if(copy_from_user(data, buffer, len)) - return -EFAULT; data[len] = 0; - *ppos += *lenp; } else { len = strlen(data); if (len > maxlen) -- cgit v1.2.3 From f4aacea2f5d1a5f7e3154e967d70cf3f711bcd61 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:19 -0700 Subject: sysctl: allow for strict write position handling When writing to a sysctl string, each write, regardless of VFS position, begins writing the string from the start. 
This means the contents of the last write to the sysctl controls the string contents instead of the first: open("/proc/sys/kernel/modprobe", O_WRONLY) = 1 write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 4096) = 4096 write(1, "/bin/true", 9) = 9 close(1) = 0 $ cat /proc/sys/kernel/modprobe /bin/true Expected behaviour would be to have the sysctl be "AAAA..." capped at maxlen (in this case KMOD_PATH_LEN: 256), instead of truncating to the contents of the second write. Similarly, multiple short writes would not append to the sysctl. The old behavior is unlike regular POSIX files enough that doing audits of software that interact with sysctls can end up in unexpected or dangerous situations. For example, "as long as the input starts with a trusted path" turns out to be an insufficient filter, as what must also happen is for the input to be entirely contained in a single write syscall -- not a common consideration, especially for high level tools. This provides kernel.sysctl_writes_strict as a way to make this behavior act in a less surprising manner for strings, and disallows non-zero file position when writing numeric sysctls (similar to what is already done when reading from non-zero file positions). For now, the default (0) is to warn about non-zero file position use, but retain the legacy behavior. Setting this to -1 disables the warning, and setting this to 1 enables the file position respecting behavior. [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: move misplaced hunk, per Randy] Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 21 +++++++++++++ kernel/sysctl.c | 69 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 9886c3d57fc2..708bb7f1b7e0 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -77,6 +77,7 @@ show up in /proc/sys/kernel: - shmmni - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt +- sysctl_writes_strict - tainted - threads-max - unknown_nmi_panic @@ -762,6 +763,26 @@ without users and with a dead originative process will be destroyed. ============================================================== +sysctl_writes_strict: + +Control how file position affects the behavior of updating sysctl values +via the /proc/sys interface: + + -1 - Legacy per-write sysctl value handling, with no printk warnings. + Each write syscall must fully contain the sysctl value to be + written, and multiple writes on the same sysctl file descriptor + will rewrite the sysctl value, regardless of file position. + 0 - (default) Same behavior as above, but warn about processes that + perform writes to a sysctl file descriptor when the file position + is not 0. + 1 - Respect file position when writing sysctl strings. Multiple writes + will append to the sysctl value buffer. Anything past the max length + of the sysctl value buffer will be ignored. Writes to numeric sysctl + entries must always be at file position 0 and the value must be + fully contained in the buffer sent in the write syscall. + +============================================================== + tainted: Non-zero if the kernel has been tainted. 
Numeric values, which diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ac6847feaa83..7a910b9081e8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,6 +173,13 @@ extern int no_unaligned_warning; #endif #ifdef CONFIG_PROC_SYSCTL + +#define SYSCTL_WRITES_LEGACY -1 +#define SYSCTL_WRITES_WARN 0 +#define SYSCTL_WRITES_STRICT 1 + +static int sysctl_writes_strict = SYSCTL_WRITES_WARN; + static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); static int proc_taint(struct ctl_table *table, int write, @@ -495,6 +502,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_taint, }, + { + .procname = "sysctl_writes_strict", + .data = &sysctl_writes_strict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + .extra2 = &one, + }, #endif #ifdef CONFIG_LATENCYTOP { @@ -1717,8 +1733,20 @@ static int _proc_do_string(char *data, int maxlen, int write, } if (write) { - /* Start writing from beginning of buffer. */ - len = 0; + if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { + /* Only continue writes not past the end of buffer. */ + len = strlen(data); + if (len > maxlen - 1) + len = maxlen - 1; + + if (*ppos > len) + return 0; + len = *ppos; + } else { + /* Start writing from beginning of buffer. */ + len = 0; + } + *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { @@ -1758,6 +1786,14 @@ static int _proc_do_string(char *data, int maxlen, int write, return 0; } +static void warn_sysctl_write(struct ctl_table *table) +{ + pr_warn_once("%s wrote to %s when file position was not 0!\n" + "This will not be supported in the future. To silence this\n" + "warning, set kernel.sysctl_writes_strict = -1\n", + current->comm, table->procname); +} + /** * proc_dostring - read a string sysctl * @table: the sysctl table @@ -1778,6 +1814,9 @@ static int _proc_do_string(char *data, int maxlen, int write, int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) + warn_sysctl_write(table); + return _proc_do_string((char *)(table->data), table->maxlen, write, (char __user *)buffer, lenp, ppos); } @@ -1953,6 +1992,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2010,6 +2061,7 @@ free: return err ? : -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } @@ -2202,6 +2254,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2257,6 +2321,7 @@ free: return err ? 
: -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } -- cgit v1.2.3 From 68a9a435e4efb63c49a0c0a25756e3b71a5634ed Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:37:21 -0700 Subject: kernel/user_namespace.c: kernel-doc/checkpatch fixes -uid->gid -split some function declarations -if/then/else warning Signed-off-by: Fabian Frederick Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index bf71b4b2d632..fcc02560fd6b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in - * @uid: group identifier + * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. @@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v) return 0; } -static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) +static void *m_start(struct seq_file *seq, loff_t *ppos, + struct uid_gid_map *map) { struct uid_gid_extent *extent = NULL; loff_t pos = *ppos; @@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = { .show = projid_m_show, }; -static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) +static bool mappings_overlap(struct uid_gid_map *new_map, + struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; @@ -653,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EINVAL; pos = kbuf; new_map.nr_extents = 0; - for (;pos; pos = next_line) { + for (; pos; pos = next_line) { extent = &new_map.extent[new_map.nr_extents]; /* Find the end of line and ensure I don't look past it */ @@ -687,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Verify we have been given valid starting values */ if ((extent->first == (u32) -1) || - (extent->lower_first == (u32) -1 )) + (extent->lower_first == (u32) -1)) goto out; - /* Verify count is not zero and does not cause the extent to wrap */ + /* Verify count is not zero and does not cause the + * extent to wrap + */ if ((extent->first + extent->count) <= extent->first) goto out; - if ((extent->lower_first + extent->count) <= extent->lower_first) + if ((extent->lower_first + extent->count) <= + extent->lower_first) goto out; /* Do the ranges in extent overlap any previous extents? 
*/ @@ -751,7 +756,8 @@ out: return ret; } -ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_uid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -767,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz &ns->uid_map, &ns->parent->uid_map); } -ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_gid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -783,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz &ns->gid_map, &ns->parent->gid_map); } -ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_projid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -800,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t &ns->projid_map, &ns->parent->projid_map); } -static bool new_idmap_permitted(const struct file *file, +static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { @@ -811,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file, kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, file->f_cred->fsuid)) return true; - } - else if (cap_setid == CAP_SETGID) { + } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (gid_eq(gid, file->f_cred->fsgid)) return true; -- cgit v1.2.3 From aba871f1e9a8dad07d442913998cd679529f2784 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:37:29 -0700 Subject: kernel/profile.c: convert printk to pr_foo() Signed-off-by: Fabian Frederick Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index cb980f0c731b..c1204e2af5b2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -64,12 +64,10 @@ int profile_setup(char *str) str += strlen(sleepstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel sleep profiling enabled (shift: %ld)\n", + pr_info("kernel sleep profiling enabled (shift: %ld)\n", prof_shift); #else - printk(KERN_WARNING - "kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); + pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; @@ -77,8 +75,7 @@ int profile_setup(char *str) str += strlen(schedstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel schedule profiling enabled (shift: %ld)\n", + pr_info("kernel schedule profiling enabled (shift: %ld)\n", prof_shift); } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; @@ -86,13 +83,12 @@ int profile_setup(char *str) str += strlen(kvmstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel KVM profiling enabled (shift: %ld)\n", + pr_info("kernel KVM profiling enabled (shift: %ld)\n", prof_shift); } 
else if (get_option(&str, &par)) { prof_shift = par; prof_on = CPU_PROFILING; - printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", + pr_info("kernel profiling enabled (shift: %ld)\n", prof_shift); } return 1; -- cgit v1.2.3 From f3da64d1ea12ed6cfe18e6dedbc9739743e0b884 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:37:30 -0700 Subject: kernel/profile.c: use static const char instead of static char schedstr, sleepstr and kvmstr are only used in strcmp & strlen Signed-off-by: Fabian Frederick Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index c1204e2af5b2..54bf5ba26420 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex); int profile_setup(char *str) { - static char schedstr[] = "schedule"; - static char sleepstr[] = "sleep"; - static char kvmstr[] = "kvm"; + static const char schedstr[] = "schedule"; + static const char sleepstr[] = "sleep"; + static const char kvmstr[] = "kvm"; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { -- cgit v1.2.3 From 7153e402731c3e72331633d1ac15a654768aecac Mon Sep 17 00:00:00 2001 From: Paul McQuade Date: Fri, 6 Jun 2014 14:37:37 -0700 Subject: ipc, kernel: use Linux headers Use #include instead of Use #include instead of Signed-off-by: Paul McQuade Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/compat.c | 2 +- ipc/compat_mq.c | 2 +- ipc/msg.c | 2 +- ipc/sem.c | 2 +- ipc/shm.c | 2 +- kernel/acct.c | 2 +- kernel/audit.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/ipc/compat.c b/ipc/compat.c index 45d035d4cedc..b5ef4f7946dc 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include "util.h" diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c index 90d29f59cac6..ef6f91cc4490 100644 --- a/ipc/compat_mq.c +++ b/ipc/compat_mq.c @@ -12,7 +12,7 @@ #include #include -#include +#include struct compat_mq_attr { compat_long_t mq_flags; /* message queue flags */ diff --git a/ipc/msg.c b/ipc/msg.c index 35e4018de53c..d608e6dde919 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -39,7 +39,7 @@ #include #include -#include +#include #include "util.h" /* diff --git a/ipc/sem.c b/ipc/sem.c index 3fcbc96abee9..e8dcc72d5bef 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -87,7 +87,7 @@ #include #include -#include +#include #include "util.h" /* One semaphore structure for each semaphore in the system. 
*/ diff --git a/ipc/shm.c b/ipc/shm.c index b54c93f6d117..fe49fdd240a8 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -43,7 +43,7 @@ #include #include -#include +#include #include "util.h" diff --git a/kernel/acct.c b/kernel/acct.c index 8d6e145138bb..df2851905d14 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include /* sector_div */ #include diff --git a/kernel/audit.c b/kernel/audit.c index 47845c57eb19..f30106459a32 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -44,7 +44,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include +#include #include #include #include -- cgit v1.2.3 From 46c0a8ca3e841b14a1d981e2116eaf2d1c7f2235 Mon Sep 17 00:00:00 2001 From: Paul McQuade Date: Fri, 6 Jun 2014 14:37:37 -0700 Subject: ipc, kernel: clear whitespace trailing whitespace Signed-off-by: Paul McQuade Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 31 +++++++++++++++---------------- ipc/sem.c | 10 +++++----- ipc/shm.c | 2 +- ipc/util.c | 4 ++-- ipc/util.h | 8 ++++---- kernel/acct.c | 4 ++-- 6 files changed, 29 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/ipc/msg.c b/ipc/msg.c index d608e6dde919..7ed1ef338e76 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -611,23 +611,22 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) static int testmsg(struct msg_msg *msg, long type, int mode) { - switch (mode) - { - case SEARCH_ANY: - case SEARCH_NUMBER: + switch (mode) { + case SEARCH_ANY: + case SEARCH_NUMBER: + return 1; + case SEARCH_LESSEQUAL: + if (msg->m_type <= type) return 1; - case SEARCH_LESSEQUAL: - if (msg->m_type <= type) - return 1; - break; - case SEARCH_EQUAL: - if (msg->m_type == type) - return 1; - break; - case SEARCH_NOTEQUAL: - if (msg->m_type != type) - return 1; - break; + break; + case SEARCH_EQUAL: + if (msg->m_type == type) + return 1; + break; + case SEARCH_NOTEQUAL: + if (msg->m_type != type) + return 1; + break; } return 0; } diff --git a/ipc/sem.c b/ipc/sem.c index e8dcc72d5bef..fe0928a3d08b 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -160,7 +160,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); * sem_array.pending{_alter,_cont}, * sem_array.sem_undo: global sem_lock() for read/write * sem_undo.proc_next: only "current" is allowed to read/write that field. - * + * * sem_array.sem_base[i].pending_{const,alter}: * global or semaphore sem_lock() for read/write */ @@ -1161,7 +1161,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, err = security_sem_semctl(NULL, cmd); if (err) return err; - + memset(&seminfo, 0, sizeof(seminfo)); seminfo.semmni = ns->sc_semmni; seminfo.semmns = ns->sc_semmns; @@ -1181,7 +1181,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, } max_id = ipc_get_maxid(&sem_ids(ns)); up_read(&sem_ids(ns).rwsem); - if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) + if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 0 : max_id; } @@ -1883,7 +1883,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, /* We need to sleep on this operation, so we put the current * task into the pending queue and go to sleep. 
*/ - + queue.sops = sops; queue.nsops = nsops; queue.undo = un; @@ -2016,7 +2016,7 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) return error; atomic_inc(&undo_list->refcnt); tsk->sysvsem.undo_list = undo_list; - } else + } else tsk->sysvsem.undo_list = NULL; return 0; diff --git a/ipc/shm.c b/ipc/shm.c index fe49fdd240a8..2b64b0d25bba 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -694,7 +694,7 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf out.shmmin = in->shmmin; out.shmmni = in->shmmni; out.shmseg = in->shmseg; - out.shmall = in->shmall; + out.shmall = in->shmall; return copy_to_user(buf, &out, sizeof(out)); } diff --git a/ipc/util.c b/ipc/util.c index 9b3fa38afe2c..27d74e69fd57 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -183,7 +183,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, * ipc_findkey - find a key in an ipc identifier set * @ids: ipc identifier set * @key: key to find - * + * * Returns the locked pointer to the ipc structure if found or NULL * otherwise. If key is found ipc points to the owning ipc structure * @@ -538,7 +538,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag) else if (in_group_p(ipcp->cgid) || in_group_p(ipcp->gid)) granted_mode >>= 3; /* is there some bit set in requested_mode but not in granted_mode? */ - if ((requested_mode & ~granted_mode & 0007) && + if ((requested_mode & ~granted_mode & 0007) && !ns_capable(ns->user_ns, CAP_IPC_OWNER)) return -1; diff --git a/ipc/util.h b/ipc/util.h index e1153ad574b7..1a5a0fcd099c 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -78,9 +78,9 @@ struct ipc_params { * . routine to call for an extra check if needed */ struct ipc_ops { - int (*getnew) (struct ipc_namespace *, struct ipc_params *); - int (*associate) (struct kern_ipc_perm *, int); - int (*more_checks) (struct kern_ipc_perm *, struct ipc_params *); + int (*getnew)(struct ipc_namespace *, struct ipc_params *); + int (*associate)(struct kern_ipc_perm *, int); + int (*more_checks)(struct kern_ipc_perm *, struct ipc_params *); }; struct seq_file; @@ -142,7 +142,7 @@ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, struct ipc64_perm *perm, int extra_perm); #ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION - /* On IA-64, we always use the "64-bit version" of the IPC structures. */ +/* On IA-64, we always use the "64-bit version" of the IPC structures. */ # define ipc_parse_version(cmd) IPC_64 #else int ipc_parse_version(int *cmd); diff --git a/kernel/acct.c b/kernel/acct.c index df2851905d14..808a86ff229d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) spin_lock(&acct_lock); if (file != acct->file) { if (act) - res = act>0; + res = act > 0; goto out; } @@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); if (IS_ERR(tmp)) - return (PTR_ERR(tmp)); + return PTR_ERR(tmp); error = acct_on(tmp); putname(tmp); } else { -- cgit v1.2.3 From 119ce5c8b99cd6b874cec58c81ce9b56e52160c3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:37:53 -0700 Subject: kernel/seccomp.c: kernel-doc warning fix + fix small typo Signed-off-by: Fabian Frederick Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b35c21503a36..f6d76bebe69f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -39,7 +39,7 @@ * is only needed for handling filters shared across tasks. * @prev: points to a previously installed, or inherited, filter * @len: the number of instructions in the program - * @insns: the BPF program instructions to evaluate + * @insnsi: the BPF program instructions to evaluate * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -220,7 +220,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) return -ENOMEM; /* - * Installing a seccomp filter requires that the task have + * Installing a seccomp filter requires that the task has * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. -- cgit v1.2.3 From 6f8fd1d77e64dd6be1075041cdfd8de4c2ca4e2b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 6 Jun 2014 14:38:08 -0700 Subject: sysctl: convert use of typedef ctl_table to struct ctl_table This typedef is unnecessary and should just be removed. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- kernel/utsname_sysctl.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7a910b9081e8..db19e3e2aa4b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -202,7 +202,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; -static int sysrq_sysctl_handler(ctl_table *table, int write, +static int sysrq_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 6fbe811c7ad1..c8eac43267e9 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -17,7 +17,7 @@ #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(ctl_table *table, int write) +static void *get_uts(struct ctl_table *table, int write) { char *which = table->data; struct uts_namespace *uts_ns; @@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write) return which; } -static void put_uts(ctl_table *table, int write, void *which) +static void put_uts(struct ctl_table *table, int write, void *which) { if (!write) up_read(&uts_sem); @@ -44,7 +44,7 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, +static int proc_do_uts_string(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; -- cgit v1.2.3