From a2f5630cb737787c1bfd9aa894b1bf9f3f4554ea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Sep 2015 17:47:16 -0400 Subject: percpu_ref: remove unnecessary RCU grace period for staggered atomic switching confirmation At the beginning, percpu_ref guaranteed a RCU grace period between a call to percpu_ref_kill_and_confirm() and the invocation of the confirmation callback. This guarantee exposed internal implementation details and got rescinded while switching over to sched RCU; however, __percpu_ref_switch_to_atomic() still inserts a full sched RCU grace period even when it can simply wait for the previous attempt. Remove the unnecessary grace period and perform the confirmation synchronously for staggered atomic switching attempts. Update comments accordingly. Signed-off-by: Tejun Heo --- lib/percpu-refcount.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 27fe74948882..8ade009ca2c9 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -177,17 +177,11 @@ static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); } else if (confirm_switch) { /* - * Somebody already set ATOMIC. Switching may still be in - * progress. @confirm_switch must be invoked after the - * switching is complete and a full sched RCU grace period - * has passed. Wait synchronously for the previous - * switching and schedule @confirm_switch invocation. + * Somebody else already set ATOMIC. Wait for its + * completion and invoke @confirm_switch() directly. */ wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - ref->confirm_switch = confirm_switch; - - percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); + confirm_switch(ref); } } @@ -211,10 +205,6 @@ static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, * but it may block if @confirm_kill is specified and @ref is already in * the process of switching to atomic mode. In such cases, @confirm_switch * will be invoked after the switching is complete. - * - * Due to the way percpu_ref is implemented, @confirm_switch will be called - * after at least one full sched RCU grace period has passed but this is an - * implementation detail and must not be depended upon. */ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) @@ -290,11 +280,7 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the - * process of switching to atomic mode by percpu_ref_switch_atomic(). - * - * Due to the way percpu_ref is implemented, @confirm_switch will be called - * after at least one full sched RCU grace period has passed but this is an - * implementation detail and must not be depended upon. + * process of switching to atomic mode by percpu_ref_switch_to_atomic(). 
*/ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) -- cgit v1.2.3 From b2302c7fdc654d249c546aac6228b8e10969bc1e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Sep 2015 17:47:17 -0400 Subject: percpu_ref: reorganize __percpu_ref_switch_to_atomic() and relocate percpu_ref_switch_to_atomic() Reorganize __percpu_ref_switch_to_atomic() so that it looks structurally similar to __percpu_ref_switch_to_percpu() and relocate percpu_ref_switch_to_atomic so that the two internal functions are co-located. This patch doesn't introduce any functional differences. Signed-off-by: Tejun Heo --- lib/percpu-refcount.c | 98 ++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 8ade009ca2c9..599a78c0633b 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -161,56 +161,30 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { - if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { - /* switching from percpu to atomic */ - ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; - - /* - * Non-NULL ->confirm_switch is used to indicate that - * switching is in progress. Use noop one if unspecified. - */ - WARN_ON_ONCE(ref->confirm_switch); - ref->confirm_switch = - confirm_switch ?: percpu_ref_noop_confirm_switch; - - percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); - } else if (confirm_switch) { - /* - * Somebody else already set ATOMIC. Wait for its - * completion and invoke @confirm_switch() directly. - */ - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - confirm_switch(ref); + if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { + if (confirm_switch) { + /* + * Somebody else already set ATOMIC. Wait for its + * completion and invoke @confirm_switch() directly. + */ + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + confirm_switch(ref); + } + return; } -} -/** - * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode - * @ref: percpu_ref to switch to atomic mode - * @confirm_switch: optional confirmation callback - * - * There's no reason to use this function for the usual reference counting. - * Use percpu_ref_kill[_and_confirm](). - * - * Schedule switching of @ref to atomic mode. All its percpu counts will - * be collected to the main atomic counter. On completion, when all CPUs - * are guaraneed to be in atomic mode, @confirm_switch, which may not - * block, is invoked. This function may be invoked concurrently with all - * the get/put operations and can safely be mixed with kill and reinit - * operations. Note that @ref will stay in atomic mode across kill/reinit - * cycles until percpu_ref_switch_to_percpu() is called. - * - * This function normally doesn't block and can be called from any context - * but it may block if @confirm_kill is specified and @ref is already in - * the process of switching to atomic mode. In such cases, @confirm_switch - * will be invoked after the switching is complete. 
- */ -void percpu_ref_switch_to_atomic(struct percpu_ref *ref, - percpu_ref_func_t *confirm_switch) -{ - ref->force_atomic = true; - __percpu_ref_switch_to_atomic(ref, confirm_switch); + /* switching from percpu to atomic */ + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + + /* + * Non-NULL ->confirm_switch is used to indicate that switching is + * in progress. Use noop one if unspecified. + */ + WARN_ON_ONCE(ref->confirm_switch); + ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; + + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) @@ -240,6 +214,34 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } +/** + * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * @confirm_switch: optional confirmation callback + * + * There's no reason to use this function for the usual reference counting. + * Use percpu_ref_kill[_and_confirm](). + * + * Schedule switching of @ref to atomic mode. All its percpu counts will + * be collected to the main atomic counter. On completion, when all CPUs + * are guaraneed to be in atomic mode, @confirm_switch, which may not + * block, is invoked. This function may be invoked concurrently with all + * the get/put operations and can safely be mixed with kill and reinit + * operations. Note that @ref will stay in atomic mode across kill/reinit + * cycles until percpu_ref_switch_to_percpu() is called. + * + * This function normally doesn't block and can be called from any context + * but it may block if @confirm_kill is specified and @ref is already in + * the process of switching to atomic mode. In such cases, @confirm_switch + * will be invoked after the switching is complete. + */ +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + ref->force_atomic = true; + __percpu_ref_switch_to_atomic(ref, confirm_switch); +} + /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode + * @ref: percpu_ref to switch to percpu mode -- cgit v1.2.3 From 18808354b79622ed11857e41f9044ba17aec5b01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Sep 2015 17:47:18 -0400 Subject: percpu_ref: unify staggered atomic switching wait behavior When an atomic or percpu switching starts before the previous atomic switching finishes, the following behaviors are taken: * If the new atomic switching has a confirmation callback, it waits for the previous atomic switching to complete. * If the new percpu switching is the first percpu switching following the previous atomic switching, it waits for the previous atomic switching to complete. No percpu_ref user depends on these subtleties. The only meaningful part is that, if the caller ensures that atomic switching isn't in progress, mode switching operations can be issued from any context. This patch pulls the wait logic to the top of both switching functions so that they always wait for the previous atomic switching to complete. This makes the behavior simpler and consistent for both directions and will help allow concurrent invocations of the mode switching functions.
Signed-off-by: Tejun Heo --- lib/percpu-refcount.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 599a78c0633b..c3617a8525d7 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -161,15 +161,19 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { + /* + * If the previous ATOMIC switching hasn't finished yet, wait for + * its completion. If the caller ensures that ATOMIC switching + * isn't in progress, this function can be called from any context. + * Do an extra confirm_switch test to circumvent the unconditional + * might_sleep() in wait_event(). + */ + if (ref->confirm_switch) + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { - if (confirm_switch) { - /* - * Somebody else already set ATOMIC. Wait for its - * completion and invoke @confirm_switch() directly. - */ - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + if (confirm_switch) confirm_switch(ref); - } return; } @@ -180,7 +184,6 @@ static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, * Non-NULL ->confirm_switch is used to indicate that switching is * in progress. Use noop one if unspecified. */ - WARN_ON_ONCE(ref->confirm_switch); ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; percpu_ref_get(ref); /* put after confirmation */ @@ -192,13 +195,21 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; + /* + * If the previous ATOMIC switching hasn't finished yet, wait for + * its completion. If the caller ensures that ATOMIC switching + * isn't in progress, this function can be called from any context. + * Do an extra confirm_switch test to circumvent the unconditional + * might_sleep() in wait_event(). + */ + if (ref->confirm_switch) + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + BUG_ON(!percpu_count); if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* -- cgit v1.2.3 From 3f49bdd95855a33eea749304d2e10530a869218b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Sep 2015 17:47:19 -0400 Subject: percpu_ref: restructure operation mode switching Restructure atomic/percpu mode switching. * The users of __percpu_ref_switch_to_atomic/percpu() now call a new function __percpu_ref_switch_mode() which calls either of the original switching functions depending on the current state of ref->force_atomic and the __PERCPU_REF_DEAD flag. The callers no longer check whether switching is necessary but always invoke __percpu_ref_switch_mode(). * !ref->confirm_switch waiting is collected into __percpu_ref_switch_mode(). This patch doesn't cause any behavior differences. 
Signed-off-by: Tejun Heo --- lib/percpu-refcount.c | 64 +++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index c3617a8525d7..f3ff793691ac 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -161,16 +161,6 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { - /* - * If the previous ATOMIC switching hasn't finished yet, wait for - * its completion. If the caller ensures that ATOMIC switching - * isn't in progress, this function can be called from any context. - * Do an extra confirm_switch test to circumvent the unconditional - * might_sleep() in wait_event(). - */ - if (ref->confirm_switch) - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { if (confirm_switch) confirm_switch(ref); @@ -195,16 +185,6 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; - /* - * If the previous ATOMIC switching hasn't finished yet, wait for - * its completion. If the caller ensures that ATOMIC switching - * isn't in progress, this function can be called from any context. - * Do an extra confirm_switch test to circumvent the unconditional - * might_sleep() in wait_event(). - */ - if (ref->confirm_switch) - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - BUG_ON(!percpu_count); if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) @@ -225,6 +205,25 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } +static void __percpu_ref_switch_mode(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + /* + * If the previous ATOMIC switching hasn't finished yet, wait for + * its completion. If the caller ensures that ATOMIC switching + * isn't in progress, this function can be called from any context. + * Do an extra confirm_switch test to circumvent the unconditional + * might_sleep() in wait_event(). + */ + if (ref->confirm_switch) + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + + if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD)) + __percpu_ref_switch_to_atomic(ref, confirm_switch); + else + __percpu_ref_switch_to_percpu(ref); +} + /** * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode @@ -241,16 +240,15 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) * operations. Note that @ref will stay in atomic mode across kill/reinit * cycles until percpu_ref_switch_to_percpu() is called. * - * This function normally doesn't block and can be called from any context - * but it may block if @confirm_kill is specified and @ref is already in - * the process of switching to atomic mode. In such cases, @confirm_switch - * will be invoked after the switching is complete. + * This function may block if @ref is in the process of switching to atomic + * mode. If the caller ensures that @ref is not in the process of + * switching to atomic mode, this function can be called from any context. 
*/ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { ref->force_atomic = true; - __percpu_ref_switch_to_atomic(ref, confirm_switch); + __percpu_ref_switch_mode(ref, confirm_switch); } /** @@ -267,17 +265,14 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, * dying or dead, the actual switching takes place on the following * percpu_ref_reinit(). * - * This function normally doesn't block and can be called from any context - * but it may block if @ref is in the process of switching to atomic mode - * by percpu_ref_switch_atomic(). + * This function may block if @ref is in the process of switching to atomic + * mode. If the caller ensures that @ref is not in the process of + * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { ref->force_atomic = false; - - /* a dying or dead ref can't be switched to percpu mode w/o reinit */ - if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) - __percpu_ref_switch_to_percpu(ref); + __percpu_ref_switch_mode(ref, NULL); } /** @@ -302,7 +297,7 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, "%s called more than once on %pf!", __func__, ref->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; - __percpu_ref_switch_to_atomic(ref, confirm_kill); + __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); @@ -324,7 +319,6 @@ void percpu_ref_reinit(struct percpu_ref *ref) ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); - if (!ref->force_atomic) - __percpu_ref_switch_to_percpu(ref); + __percpu_ref_switch_mode(ref, NULL); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit v1.2.3 From 33e465ce7cb30b71c113a26f36d293b545a28e12 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Sep 2015 17:47:20 -0400 Subject: percpu_ref: allow operation mode switching operations to be called concurrently percpu_ref initially didn't have explicit mode switching operations. It started out in percpu mode and switched to atomic mode on kill and then released. Ensuring that the kill operation is initiated only after init completes was naturally the caller's responsibility. percpu_ref_reinit() was introduced later but it didn't shift the synchronization responsibility. Reinit can't be performed until kill is confirmed, so there was nothing to worry about synchronization-wise. Also, as both reinit and kill manipulate the base reference, invocations of the same function couldn't be allowed to race each other. The latest additions of percpu_ref_switch_to_atomic/percpu() changed the situation. These two functions can be called any time as long as the percpu_ref is between init and exit and thus there are valid usage scenarios where these new functions race with each other or against reinit/kill. Mostly from inertia, f47ad4578461 ("percpu_ref: decouple switching to percpu mode and reinit") still left synchronization among percpu mode switching operations to its users. That the new switch functions can be freely mixed with kill/reinit but the operations themselves should be synchronized is too subtle a requirement and led to a very subtle race condition in the blk-mq freezing path. This patch fixes the situation by introducing percpu_ref_switch_lock to protect mode switching operations. This ensures that percpu-ref users don't have to worry about mode changing operations racing against each other, e.g.
switch_to_percpu against kill, as long as the sequence of operations is valid. Signed-off-by: Tejun Heo Reported-by: Akinobu Mita Link: http://lkml.kernel.org/g/1443287365-4244-7-git-send-email-akinobu.mita@gmail.com Fixes: f47ad4578461 ("percpu_ref: decouple switching to percpu mode and reinit") --- lib/percpu-refcount.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index f3ff793691ac..c69938e4b0d5 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -33,6 +33,7 @@ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) +static DEFINE_SPINLOCK(percpu_ref_switch_lock); static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) @@ -208,15 +209,15 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) static void __percpu_ref_switch_mode(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { + lockdep_assert_held(&percpu_ref_switch_lock); + /* * If the previous ATOMIC switching hasn't finished yet, wait for * its completion. If the caller ensures that ATOMIC switching * isn't in progress, this function can be called from any context. - * Do an extra confirm_switch test to circumvent the unconditional - * might_sleep() in wait_event(). */ - if (ref->confirm_switch) - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch, + percpu_ref_switch_lock); if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD)) __percpu_ref_switch_to_atomic(ref, confirm_switch); @@ -247,8 +248,14 @@ static void __percpu_ref_switch_mode(struct percpu_ref *ref, void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + ref->force_atomic = true; __percpu_ref_switch_mode(ref, confirm_switch); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } /** @@ -271,8 +278,14 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + ref->force_atomic = false; __percpu_ref_switch_mode(ref, NULL); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } /** @@ -293,12 +306,18 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, "%s called more than once on %pf!", __func__, ref->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); @@ -315,10 +334,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); */ void percpu_ref_reinit(struct percpu_ref *ref) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); __percpu_ref_switch_mode(ref, NULL); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit v1.2.3 From a67823c1ed1092160da94c31e6da5aeb35dca81c Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Thu, 11 
Aug 2016 19:27:09 +0200 Subject: percpu-refcount: init ->confirm_switch member properly This patch targets two things which are related to ->confirm_switch: 1. Init the ->confirm_switch pointer with NULL on percpu_ref_init(), or the kernel frightfully complains with WARN_ON_ONCE(ref->confirm_switch) at __percpu_ref_switch_to_atomic() if the memory chunk was not properly zeroed. 2. Warn if an RCU callback is still in progress on percpu_ref_exit(). The race still exists, because percpu_ref_call_confirm_rcu() drops ->confirm_switch to NULL early, but that is only a warning and the caller is still responsible for ensuring that the ref is no longer in active use. Hopefully that can help catch incorrect usage of percpu-refcount. Signed-off-by: Roman Pen Cc: Tejun Heo Cc: linux-kernel@vger.kernel.org Signed-off-by: Tejun Heo --- lib/percpu-refcount.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index c69938e4b0d5..9ac959ef4cae 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -83,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, atomic_long_set(&ref->count, start_count); ref->release = release; + ref->confirm_switch = NULL; return 0; } EXPORT_SYMBOL_GPL(percpu_ref_init); @@ -102,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref) unsigned long __percpu *percpu_count = percpu_count_ptr(ref); if (percpu_count) { + /* non-NULL confirm_switch indicates switching in progress */ + WARN_ON_ONCE(ref->confirm_switch); free_percpu(percpu_count); ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } -- cgit v1.2.3 From 1b5ca12127427c51be605a75ecd0141eb3357249 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 22 Sep 2016 11:55:54 -0400 Subject: percpu: improve generic percpu modify-return implementation Some architectures require an additional load to find the address of percpu pointers. In some implementations, the C aliasing rules do not allow the result of that load to be kept over the store that modifies the percpu variable, which causes additional loads. Work around this by finding the pointer first, then operating on that. It's also possible to mark things as restrict and play those kinds of games, but that can require larger and arch-specific changes.
On powerpc, __this_cpu_inc_return compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 ld 9,48(13) ldx 3,9,3 With this patch it compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 Signed-off-by: Nicholas Piggin To: Tejun Heo To: Christoph Lameter Cc: linux-kernel@vger.kernel.org Cc: linux-arch@vger.kernel.org Signed-off-by: Tejun Heo --- include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 4d9f233c4ba8..40e887068da2 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void); #define PER_CPU_DEF_ATTRIBUTES #endif +#define raw_cpu_generic_read(pcp) \ +({ \ + *raw_cpu_ptr(&(pcp)); \ +}) + #define raw_cpu_generic_to_op(pcp, val, op) \ do { \ *raw_cpu_ptr(&(pcp)) op val; \ @@ -72,34 +77,39 @@ do { \ #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - raw_cpu_add(pcp, val); \ - raw_cpu_read(pcp); \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ + \ + *__p += val; \ + *__p; \ }) #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret = raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret = *__p; \ + *__p = nval; \ __ret; \ }) #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret = raw_cpu_read(pcp); \ + __ret = *__p; \ if (__ret == (oval)) \ - raw_cpu_write(pcp, nval); \ + *__p = nval; \ __ret; \ }) #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ ({ \ + typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1)); \ + typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2)); \ int __ret = 0; \ - if (raw_cpu_read(pcp1) == (oval1) && \ - raw_cpu_read(pcp2) == (oval2)) { \ - raw_cpu_write(pcp1, nval1); \ - raw_cpu_write(pcp2, nval2); \ + if (*__p1 == (oval1) && *__p2 == (oval2)) { \ + *__p1 = nval1; \ + *__p2 = nval2; \ __ret = 1; \ } \ (__ret); \ @@ -109,7 +119,7 @@ do { \ ({ \ typeof(pcp) __ret; \ preempt_disable(); \ - __ret = *this_cpu_ptr(&(pcp)); \ + __ret = raw_cpu_generic_read(pcp); \ preempt_enable(); \ __ret; \ }) @@ -118,17 +128,17 @@ do { \ do { \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - *raw_cpu_ptr(&(pcp)) op val; \ + raw_cpu_generic_to_op(pcp, val, op); \ raw_local_irq_restore(__flags); \ } while (0) + #define this_cpu_generic_add_return(pcp, val) \ ({ \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - raw_cpu_add(pcp, val); \ - __ret = raw_cpu_read(pcp); \ + __ret = raw_cpu_generic_add_return(pcp, val); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -138,8 +148,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret = raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret = raw_cpu_generic_xchg(pcp, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -149,9 +158,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret = raw_cpu_read(pcp); \ - if (__ret == (oval)) \ - raw_cpu_write(pcp, nval); \ + __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -168,16 +175,16 @@ do { \ }) #ifndef raw_cpu_read_1 -#define raw_cpu_read_1(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_1(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_2 -#define raw_cpu_read_2(pcp) (*raw_cpu_ptr(&(pcp))) +#define 
raw_cpu_read_2(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_4 -#define raw_cpu_read_4(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_4(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_8 -#define raw_cpu_read_8(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_8(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_write_1 -- cgit v1.2.3 From 799bc3c51b2b120ca6e59e702ede23fff2efaf43 Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Thu, 22 Sep 2016 10:03:57 -0400 Subject: percpu: eliminate two sparse warnings Fix two cases where a __percpu pointer cast drops __percpu. Signed-off-by: Lance Richardson Signed-off-by: Tejun Heo --- arch/x86/include/asm/percpu.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e02e3f80d363..84f58de08c2b 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -521,7 +521,8 @@ do { \ static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, const unsigned long __percpu *addr) { - unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; + unsigned long __percpu *a = + (unsigned long __percpu *)addr + nr / BITS_PER_LONG; #ifdef CONFIG_X86_64 return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0; @@ -538,7 +539,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, asm volatile("bt "__percpu_arg(2)",%1\n\t" CC_SET(c) : CC_OUT(c) (oldbit) - : "m" (*(unsigned long *)addr), "Ir" (nr)); + : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); return oldbit; } -- cgit v1.2.3 From 93c76b6b2faaad7bfbc0cda840763aa4819ef26e Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Wed, 5 Oct 2016 21:19:11 +0800 Subject: mm/percpu.c: correct max_distance calculation for pcpu_embed_first_chunk() pcpu_embed_first_chunk() calculates the range a percpu chunk spans into @max_distance and uses it to ensure that a chunk is not too big compared to the total vmalloc area. However, during calculation, it used incorrect top address by adding a unit size to the highest group's base address. This can make the calculated max_distance slightly smaller than the actual distance although given the scale of values involved the error is very unlikely to have an actual impact. Fix this issue by adding the group's size instead of a unit size. 
The type of the max_distance variable is also changed from size_t to unsigned long, based on the following considerations: - unsigned long usually has the same width as the IP core registers and fits well here - it makes the type of @max_distance consistent with the operands it is calculated against, such as @ai->groups[i].base_offset and the macro VMALLOC_TOTAL - unsigned long is more universal than size_t, which is usually typedef'd to either unsigned int or unsigned long depending on the architecture Signed-off-by: zijun_hu Signed-off-by: Tejun Heo --- mm/percpu.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 9903830aaebb..e2737e56b017 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1961,7 +1961,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, areas_size, max_distance; + size_t size_sum, areas_size; + unsigned long max_distance; int group, i, rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); @@ -2023,17 +2024,18 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, } /* base address is now known, determine group base offsets */ - max_distance = 0; + i = 0; for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; - max_distance = max_t(size_t, max_distance, - ai->groups[group].base_offset); + if (areas[group] > areas[i]) + i = group; } - max_distance += ai->unit_size; + max_distance = ai->groups[i].base_offset + + ai->unit_size * ai->groups[i].nr_units; /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { - pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n", + pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n", max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ -- cgit v1.2.3 From 9b7396624a7b503220d85428654634b60762f2b0 Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Wed, 5 Oct 2016 21:30:24 +0800 Subject: mm/percpu.c: fix potential memory leakage for pcpu_embed_first_chunk() To ensure that the percpu group areas within a chunk aren't distributed too sparsely, pcpu_embed_first_chunk() takes the error handling path when a chunk spans more than 3/4 of the VMALLOC area. However, during that error handling it forgets to free the memory allocated for the percpu groups because it jumps to label @out_free instead of @out_free_areas, leaking memory if this rare case actually happens. Fix the issue by checking the spanned area immediately after completing the memory allocation for all percpu groups and, if the check fails, jumping to @out_free_areas to free the memory and return. To verify the approach, all allocated memory was dumped, the jump was forced, and all freed memory was dumped; the result confirms that every block allocated in this function is freed. The approach was chosen after considering the following alternatives: - jumping directly to @out_free would not work since several allocated memory blocks may be freed twice - the jump taken after pcpu_setup_first_chunk() is meant to bypass freeing usable memory rather than to handle errors; moreover, that function does not return an error code in any case: it either panics due to BUG_ON() or returns 0.
Signed-off-by: zijun_hu Tested-by: zijun_hu Signed-off-by: Tejun Heo --- mm/percpu.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index e2737e56b017..255714302394 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1963,7 +1963,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, struct pcpu_alloc_info *ai; size_t size_sum, areas_size; unsigned long max_distance; - int group, i, rc; + int group, i, highest_group, rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); @@ -1979,7 +1979,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, goto out_free; } - /* allocate, copy and determine base address */ + /* allocate, copy and determine base address & max_distance */ + highest_group = 0; for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; unsigned int cpu = NR_CPUS; @@ -2000,6 +2001,21 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, areas[group] = ptr; base = min(ptr, base); + if (ptr > areas[highest_group]) + highest_group = group; + } + max_distance = areas[highest_group] - base; + max_distance += ai->unit_size * ai->groups[highest_group].nr_units; + + /* warn if maximum distance is further than 75% of vmalloc space */ + if (max_distance > VMALLOC_TOTAL * 3 / 4) { + pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n", + max_distance, VMALLOC_TOTAL); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + /* and fail if we have fallback */ + rc = -EINVAL; + goto out_free_areas; +#endif } /* @@ -2024,24 +2040,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, } /* base address is now known, determine group base offsets */ - i = 0; for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; - if (areas[group] > areas[i]) - i = group; - } - max_distance = ai->groups[i].base_offset + - ai->unit_size * ai->groups[i].nr_units; - - /* warn if maximum distance is further than 75% of vmalloc space */ - if (max_distance > VMALLOC_TOTAL * 3 / 4) { - pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n", - max_distance, VMALLOC_TOTAL); -#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK - /* and fail if we have fallback */ - rc = -EINVAL; - goto out_free; -#endif } pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", -- cgit v1.2.3
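
The following is an illustrative sketch only, not part of the series above: a hypothetical percpu_ref user (my_dev, my_release and my_confirm are made-up names) showing the kill/confirm pattern whose synchronization the percpu_ref patches above tighten up.

/*
 * Illustrative sketch only, assuming a made-up my_dev object embedding a
 * percpu_ref; not taken from any of the patches in this series.
 */
#include <linux/percpu-refcount.h>
#include <linux/completion.h>
#include <linux/gfp.h>
#include <linux/kernel.h>

struct my_dev {
	struct percpu_ref ref;
	struct completion confirmed;
};

/* invoked once the last reference is dropped */
static void my_release(struct percpu_ref *ref)
{
	/* tear down or free the containing my_dev elsewhere */
}

/* invoked once all CPUs are guaranteed to see the ref as dead/atomic */
static void my_confirm(struct percpu_ref *ref)
{
	struct my_dev *dev = container_of(ref, struct my_dev, ref);

	complete(&dev->confirmed);
}

static int my_dev_setup(struct my_dev *dev)
{
	init_completion(&dev->confirmed);
	return percpu_ref_init(&dev->ref, my_release, 0, GFP_KERNEL);
}

static void my_dev_freeze(struct my_dev *dev)
{
	/*
	 * With percpu_ref_switch_lock in place, this kill may race freely
	 * with a concurrent percpu_ref_switch_to_percpu()/_to_atomic() on
	 * the same ref; only the overall init -> kill -> reinit -> exit
	 * sequence still has to remain valid.
	 */
	percpu_ref_kill_and_confirm(&dev->ref, my_confirm);
	wait_for_completion(&dev->confirmed);
}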
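
Another illustrative sketch, assuming an architecture without a specialized this_cpu_add_return implementation: a hypothetical per-cpu counter whose increment goes through the generic modify-return path reworked in the Nicholas Piggin patch above.

/*
 * Illustrative sketch only; the counter name is made up.  On the generic
 * path, this_cpu_add_return() now fetches the per-cpu pointer once with
 * raw_cpu_ptr() and both modifies and reads through that pointer.
 */
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, sketch_nr_events);

static unsigned long sketch_count_event(void)
{
	return this_cpu_add_return(sketch_nr_events, 1);
}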
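
Finally, a minimal standalone sketch of the corrected max_distance calculation from the last two patches, using simplified stand-in structures rather than the real pcpu_alloc_info: the span runs from the chunk base to the end of the highest group, i.e. its base offset plus the whole group size, not merely one unit past its base.

/*
 * Illustrative sketch only, plain C with made-up types; see
 * pcpu_embed_first_chunk() for the real code.
 */
#include <stddef.h>

struct sketch_group {
	unsigned long base_offset;	/* areas[group] - base */
	int nr_units;
};

static unsigned long sketch_max_distance(const struct sketch_group *groups,
					 int nr_groups, size_t unit_size)
{
	int group, highest = 0;
	unsigned long max_distance;

	/* find the group whose area starts highest above the base */
	for (group = 0; group < nr_groups; group++)
		if (groups[group].base_offset > groups[highest].base_offset)
			highest = group;

	/* add the whole group size (unit_size * nr_units), not one unit */
	max_distance = groups[highest].base_offset +
		       unit_size * groups[highest].nr_units;

	return max_distance;	/* compared against VMALLOC_TOTAL * 3 / 4 */
}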